@@ -262,6 +262,7 @@ endif | |||
# Build the reference-LAPACK test drivers and run the Python test harness.
# Stale driver binaries (x*) and old *.out files are removed first.
# Sub-builds go through $(MAKE), as the clean recipe in this file already
# does, so the sub-make is the same make program that is running this file
# and -n/-k are honoured.  "-j 1" keeps each sub-build serial.
lapack-test :
	(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) tmglib
	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
	# && (not ;) so the harness is never started from the wrong directory
	# when the cd fails.
	(cd $(NETLIB_LAPACK_DIR) && ./lapack_testing.py -r )
@@ -291,4 +292,6 @@ endif | |||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
@rm -f *.grd Makefile.conf_last config_last.h | |||
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
@rm -f $(NETLIB_LAPACK_DIR)/tmglib.a | |||
@echo Done. |
@@ -350,7 +350,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | |||
SLAPACKOBJS = \ | |||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | |||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | |||
slauum.$(SUFFIX) strti2.$(SUFFIX) | |||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) | |||
#DLAPACKOBJS = \ | |||
@@ -361,7 +361,7 @@ SLAPACKOBJS = \ | |||
DLAPACKOBJS = \ | |||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | |||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | |||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) | |||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) | |||
QLAPACKOBJS = \ | |||
@@ -377,7 +377,7 @@ QLAPACKOBJS = \ | |||
CLAPACKOBJS = \ | |||
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | |||
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | |||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) | |||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) | |||
#ZLAPACKOBJS = \ | |||
@@ -388,7 +388,7 @@ CLAPACKOBJS = \ | |||
ZLAPACKOBJS = \ | |||
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | |||
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | |||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) | |||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) | |||
@@ -1883,19 +1883,19 @@ ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c | |||
# x-prefixed TRTI2 object, compiled from the same complex source as the
# ztrti2 rule.  That rule reads the file from lapack/, so the prerequisite
# here uses the same path.  $(@F) writes the object into the current
# directory under the target's file name.
xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : lapack/ztrti2.c
	$(CC) -c $(CFLAGS) $< -o $(@F)
strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c | |||
strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c | |||
dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
# q-prefixed TRTRI object, compiled from the same source as the strtri and
# dtrtri rules.  Those rules read the file from lapack/, so the prerequisite
# here uses the same path.  $(@F) writes the object into the current
# directory under the target's file name.
qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : lapack/trtri.c
	$(CC) -c $(CFLAGS) $< -o $(@F)
ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c | |||
ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c | |||
ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c | |||
@@ -147,7 +147,7 @@ SLASRC = \ | |||
stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ | |||
stptrs.o \ | |||
strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ | |||
strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||
strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||
slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ | |||
stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ | |||
sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ | |||
@@ -225,7 +225,7 @@ CLASRC = \ | |||
ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ | |||
ctprfs.o ctptri.o \ | |||
ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ | |||
ctrsyl.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||
ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||
cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ | |||
cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ | |||
cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ | |||
@@ -307,7 +307,7 @@ DLASRC = \ | |||
dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ | |||
dtptrs.o \ | |||
dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ | |||
dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||
dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||
dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ | |||
dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ | |||
dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ | |||
@@ -387,7 +387,7 @@ ZLASRC = \ | |||
ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ | |||
ztprfs.o ztptri.o \ | |||
ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ | |||
ztrsyl.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||
ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||
zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ | |||
zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ | |||
zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ | |||
@@ -2,7 +2,7 @@ TOPDIR = .. | |||
include ../Makefile.system | |||
#SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | |||
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 | |||
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri | |||
FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | |||
@@ -1,190 +1,113 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
/*************************************************************************** | |||
* Copyright (c) 2013, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
/************************************************************************************** | |||
* 2014/05/22 Saar | |||
* TEST double precision unblocked : OK | |||
* 2014/05/23 Saar | |||
* TEST double precision blocked: OK | |||
* TEST single precision blocked: OK | |||
**************************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
static FLOAT dp1 = 1.; | |||
static FLOAT dm1 = -1.; | |||
// static FLOAT dp1 = 1.; | |||
// static FLOAT dm1 = -1.; | |||
#ifdef UNIT | |||
#define TRTI2 TRTI2_LU | |||
#define TRTI2 TRTI2_LU | |||
#define TRMM TRMM_LNLU | |||
#define TRSM TRSM_RNLU | |||
#else | |||
#define TRTI2 TRTI2_LN | |||
#endif | |||
#if 0 | |||
#undef GEMM_P | |||
#undef GEMM_Q | |||
#undef GEMM_R | |||
#define GEMM_P 8 | |||
#define GEMM_Q 20 | |||
#define GEMM_R 64 | |||
#define TRTI2 TRTI2_LN | |||
#define TRMM TRMM_LNLN | |||
#define TRSM TRSM_RNLN | |||
#endif | |||
#define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
BLASLONG n, lda; | |||
BLASLONG j, n, lda; | |||
FLOAT *a; | |||
BLASLONG i, is, min_i, start_i; | |||
BLASLONG ls, min_l; | |||
BLASLONG bk; | |||
BLASLONG blocking; | |||
BLASLONG range_N[2]; | |||
// BLASLONG info=0; | |||
BLASLONG jb; | |||
BLASLONG NB; | |||
BLASLONG start_j; | |||
FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||
FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
+ GEMM_OFFSET_A); | |||
FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
+ GEMM_OFFSET_B); | |||
FLOAT beta_plus[2] = { ONE, ZERO}; | |||
FLOAT beta_minus[2] = {-ONE, ZERO}; | |||
n = args -> n; | |||
a = (FLOAT *)args -> a; | |||
lda = args -> lda; | |||
if (range_n) { | |||
n = range_n[1] - range_n[0]; | |||
a += range_n[0] * (lda + 1) * COMPSIZE; | |||
} | |||
NB = GEMM_Q; | |||
if (n <= DTB_ENTRIES) { | |||
if (n < NB) { | |||
TRTI2(args, NULL, range_n, sa, sb, 0); | |||
return 0; | |||
} | |||
blocking = GEMM_Q; | |||
if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||
start_i = 0; | |||
while (start_i < n) start_i += blocking; | |||
start_i -= blocking; | |||
for (i = start_i; i >= 0; i -= blocking) { | |||
bk = MIN(blocking, n - i); | |||
if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||
if (!range_n) { | |||
range_N[0] = i; | |||
range_N[1] = i + bk; | |||
} else { | |||
range_N[0] = range_n[0] + i; | |||
range_N[1] = range_n[0] + i + bk; | |||
} | |||
CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||
if (i > 0) { | |||
TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||
for (ls = 0; ls < i; ls += REAL_GEMM_R) { | |||
min_l = i - ls; | |||
if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||
GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||
if (n - bk - i > 0) { | |||
for (is = i + bk; is < n; is += GEMM_P) { | |||
min_i = n - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
if (ls == 0) { | |||
NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa, sa_trsm, | |||
a + (is + i * lda) * COMPSIZE, lda, 0); | |||
} else { | |||
GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
} | |||
GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa, sb_gemm, | |||
a + (is + ls * lda) * COMPSIZE, lda); | |||
} | |||
} | |||
for (is = 0; is < bk; is += GEMM_P) { | |||
min_i = bk - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
TRMM_KERNEL_LT(min_i, min_l, bk, dp1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||
a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||
} | |||
} | |||
} else { | |||
if (n - bk - i > 0) { | |||
for (is = 0; is < n - bk - i; is += GEMM_P) { | |||
min_i = n - bk - i - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); | |||
TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa, sa_trsm, | |||
a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); | |||
} | |||
} | |||
} | |||
} | |||
lda = args -> lda; | |||
a = (FLOAT *) args -> a; | |||
args -> ldb = lda; | |||
args -> ldc = lda; | |||
args -> alpha = NULL; | |||
start_j = 0; | |||
while (start_j < n) start_j += NB; | |||
start_j -= NB; | |||
for (j = start_j ; j >=0 ; j-= NB) | |||
{ | |||
jb = n - j; | |||
if ( jb > NB ) jb = NB; | |||
args -> n = jb; | |||
args -> m = n-j-jb; | |||
args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE]; | |||
args -> b = &a[(j+jb+j*lda) * COMPSIZE]; | |||
args -> beta = beta_plus; | |||
TRMM(args, NULL, NULL, sa, sb, 0); | |||
args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
args -> beta = beta_minus; | |||
TRSM(args, NULL, NULL, sa, sb, 0); | |||
args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
TRTI2(args, NULL, range_n, sa, sb, 0); | |||
} | |||
return 0; | |||
} |
@@ -1,46 +1,44 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
/*************************************************************************** | |||
* Copyright (c) 2013, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
/************************************************************************************** | |||
* 2014/05/22 Saar | |||
* TEST double precision unblocked : OK | |||
* TEST double precision blocked : OK | |||
* 2014/05/23 | |||
* TEST single precision blocked : OK | |||
* | |||
**************************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
static FLOAT dp1 = 1.; | |||
static FLOAT dm1 = -1.; | |||
// static FLOAT dp1 = 1.; | |||
// static FLOAT dm1 = -1.; | |||
#ifdef UNIT | |||
#define TRTI2 TRTI2_UU | |||
@@ -48,152 +46,66 @@ static FLOAT dm1 = -1.; | |||
#define TRTI2 TRTI2_UN | |||
#endif | |||
#if 0 | |||
#undef GEMM_P | |||
#undef GEMM_Q | |||
#undef GEMM_R | |||
#define GEMM_P 8 | |||
#define GEMM_Q 20 | |||
#define GEMM_R 64 | |||
#ifdef UNIT | |||
#define TRMM TRMM_LNUU | |||
#define TRSM TRSM_RNUU | |||
#else | |||
#define TRMM TRMM_LNUN | |||
#define TRSM TRSM_RNUN | |||
#endif | |||
#define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
BLASLONG n, lda; | |||
BLASLONG j, n, lda; | |||
FLOAT *a; | |||
BLASLONG i, is, min_i, start_is; | |||
BLASLONG ls, min_l; | |||
BLASLONG bk; | |||
BLASLONG blocking; | |||
BLASLONG range_N[2]; | |||
// BLASLONG info=0; | |||
BLASLONG jb; | |||
BLASLONG NB; | |||
FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||
FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
+ GEMM_OFFSET_A); | |||
FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||
+ GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
+ GEMM_OFFSET_B); | |||
FLOAT beta_plus[2] = { ONE, ZERO}; | |||
FLOAT beta_minus[2] = {-ONE, ZERO}; | |||
n = args -> n; | |||
a = (FLOAT *)args -> a; | |||
lda = args -> lda; | |||
if (range_n) { | |||
n = range_n[1] - range_n[0]; | |||
a += range_n[0] * (lda + 1) * COMPSIZE; | |||
} | |||
NB = GEMM_Q; | |||
if (n <= DTB_ENTRIES) { | |||
if (n <= NB) { | |||
TRTI2(args, NULL, range_n, sa, sb, 0); | |||
return 0; | |||
} | |||
blocking = GEMM_Q; | |||
if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||
for (i = 0; i < n; i += blocking) { | |||
bk = MIN(blocking, n - i); | |||
if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||
if (!range_n) { | |||
range_N[0] = i; | |||
range_N[1] = i + bk; | |||
} else { | |||
range_N[0] = range_n[0] + i; | |||
range_N[1] = range_n[0] + i + bk; | |||
} | |||
CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||
if (n -bk - i > 0) { | |||
TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||
for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { | |||
min_l = n - ls; | |||
if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||
GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||
if (i > 0) { | |||
for (is = 0; is < i; is += GEMM_P) { | |||
min_i = i - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
if (ls == i + bk) { | |||
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
GEMM_BETA(min_i, bk, 0, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa, sa_trsm, | |||
a + (is + i * lda) * COMPSIZE, lda, 0); | |||
} else { | |||
GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
} | |||
GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa, sb_gemm, | |||
a + (is + ls * lda) * COMPSIZE, lda); | |||
} | |||
} | |||
start_is = 0; | |||
while (start_is < bk) start_is += GEMM_P; | |||
start_is -= GEMM_P; | |||
for (is = 0; is < bk; is += GEMM_P) { | |||
min_i = bk - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
TRMM_KERNEL_LN(min_i, min_l, bk, dp1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||
a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||
} | |||
} | |||
} else { | |||
if (i > 0) { | |||
for (is = 0; is < i; is += GEMM_P) { | |||
min_i = i - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
GEMM_BETA(min_i, bk, 0, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
lda = args -> lda; | |||
a = (FLOAT *) args -> a; | |||
args -> ldb = lda; | |||
args -> ldc = lda; | |||
args -> alpha = NULL; | |||
TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
sa, sa_trsm, | |||
a + (is + i * lda) * COMPSIZE, lda, 0); | |||
} | |||
} | |||
} | |||
} | |||
for (j = 0; j < n; j += NB) | |||
{ | |||
jb = n - j; | |||
if ( jb > NB ) jb = NB; | |||
args -> n = jb; | |||
args -> m = j; | |||
args -> a = &a[0]; | |||
args -> b = &a[(j*lda) * COMPSIZE]; | |||
args -> beta = beta_plus; | |||
TRMM(args, NULL, NULL, sa, sb, 0); | |||
args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
args -> beta = beta_minus; | |||
TRSM(args, NULL, NULL, sa, sb, 0); | |||
args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
TRTI2(args, NULL, range_n, sa, sb, 0); | |||
} | |||
return 0; | |||
} |