| @@ -262,6 +262,7 @@ endif | |||
| # lapack-test: build and run the reference LAPACK test suite against this library. | |||
| #   1. remove stale test executables (x*) and *.out logs from TESTING | |||
| #   2. build tmglib in the reference LAPACK tree | |||
| #   3. build the EIG/LIN test drivers listed below | |||
| #   4. run lapack_testing.py -r from the reference LAPACK directory | |||
| # Sub-builds go through $(MAKE) so command-line variable overrides and flags | |||
| # such as -n/-k reach them; the explicit "-j 1" keeps those sub-builds serial. | |||
| lapack-test : | |||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) tmglib | |||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||
| (cd $(NETLIB_LAPACK_DIR) && ./lapack_testing.py -r ) | |||
| @@ -291,4 +292,6 @@ endif | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
| @rm -f $(NETLIB_LAPACK_DIR)/tmglib.a | |||
| @echo Done. | |||
| @@ -350,7 +350,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | |||
| SLAPACKOBJS = \ | |||
| sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | |||
| spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | |||
| slauum.$(SUFFIX) strti2.$(SUFFIX) | |||
| slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) | |||
| #DLAPACKOBJS = \ | |||
| @@ -361,7 +361,7 @@ SLAPACKOBJS = \ | |||
| DLAPACKOBJS = \ | |||
| dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | |||
| dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | |||
| dlauum.$(SUFFIX) dtrti2.$(SUFFIX) | |||
| dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) | |||
| QLAPACKOBJS = \ | |||
| @@ -377,7 +377,7 @@ QLAPACKOBJS = \ | |||
| CLAPACKOBJS = \ | |||
| cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ | |||
| cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ | |||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) | |||
| clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) | |||
| #ZLAPACKOBJS = \ | |||
| @@ -388,7 +388,7 @@ CLAPACKOBJS = \ | |||
| ZLAPACKOBJS = \ | |||
| zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ | |||
| zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ | |||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) | |||
| zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) | |||
| @@ -1883,19 +1883,19 @@ ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c | |||
| xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c | |||
| strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c | |||
| dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c | |||
| ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c | |||
| ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c | |||
| @@ -147,7 +147,7 @@ SLASRC = \ | |||
| stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ | |||
| stptrs.o \ | |||
| strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ | |||
| strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||
| strtrs.o stzrqf.o stzrzf.o sstemr.o \ | |||
| slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ | |||
| stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ | |||
| sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ | |||
| @@ -225,7 +225,7 @@ CLASRC = \ | |||
| ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ | |||
| ctprfs.o ctptri.o \ | |||
| ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ | |||
| ctrsyl.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||
| ctrsyl.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ | |||
| cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ | |||
| cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ | |||
| cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ | |||
| @@ -307,7 +307,7 @@ DLASRC = \ | |||
| dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ | |||
| dtptrs.o \ | |||
| dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ | |||
| dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||
| dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ | |||
| dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ | |||
| dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ | |||
| dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ | |||
| @@ -387,7 +387,7 @@ ZLASRC = \ | |||
| ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ | |||
| ztprfs.o ztptri.o \ | |||
| ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ | |||
| ztrsyl.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||
| ztrsyl.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ | |||
| zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ | |||
| zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ | |||
| zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ | |||
| @@ -2,7 +2,7 @@ TOPDIR = .. | |||
| include ../Makefile.system | |||
| #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | |||
| SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 | |||
| SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri | |||
| FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | |||
| @@ -1,190 +1,113 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2013, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2014/05/22 Saar | |||
| * TEST double precision unblocked : OK | |||
| * 2014/05/23 Saar | |||
| * TEST double precision blocked: OK | |||
| * TEST single precision blocked: OK | |||
| **************************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| static FLOAT dp1 = 1.; | |||
| static FLOAT dm1 = -1.; | |||
| // static FLOAT dp1 = 1.; | |||
| // static FLOAT dm1 = -1.; | |||
| #ifdef UNIT | |||
| #define TRTI2 TRTI2_LU | |||
| #define TRTI2 TRTI2_LU | |||
| #define TRMM TRMM_LNLU | |||
| #define TRSM TRSM_RNLU | |||
| #else | |||
| #define TRTI2 TRTI2_LN | |||
| #endif | |||
| #if 0 | |||
| #undef GEMM_P | |||
| #undef GEMM_Q | |||
| #undef GEMM_R | |||
| #define GEMM_P 8 | |||
| #define GEMM_Q 20 | |||
| #define GEMM_R 64 | |||
| #define TRTI2 TRTI2_LN | |||
| #define TRMM TRMM_LNLN | |||
| #define TRSM TRSM_RNLN | |||
| #endif | |||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
| #define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
| BLASLONG n, lda; | |||
| BLASLONG j, n, lda; | |||
| FLOAT *a; | |||
| BLASLONG i, is, min_i, start_i; | |||
| BLASLONG ls, min_l; | |||
| BLASLONG bk; | |||
| BLASLONG blocking; | |||
| BLASLONG range_N[2]; | |||
| // BLASLONG info=0; | |||
| BLASLONG jb; | |||
| BLASLONG NB; | |||
| BLASLONG start_j; | |||
| FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||
| FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_A); | |||
| FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_B); | |||
| FLOAT beta_plus[2] = { ONE, ZERO}; | |||
| FLOAT beta_minus[2] = {-ONE, ZERO}; | |||
| n = args -> n; | |||
| a = (FLOAT *)args -> a; | |||
| lda = args -> lda; | |||
| if (range_n) { | |||
| n = range_n[1] - range_n[0]; | |||
| a += range_n[0] * (lda + 1) * COMPSIZE; | |||
| } | |||
| NB = GEMM_Q; | |||
| if (n <= DTB_ENTRIES) { | |||
| if (n < NB) { | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| blocking = GEMM_Q; | |||
| if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||
| start_i = 0; | |||
| while (start_i < n) start_i += blocking; | |||
| start_i -= blocking; | |||
| for (i = start_i; i >= 0; i -= blocking) { | |||
| bk = MIN(blocking, n - i); | |||
| if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||
| if (!range_n) { | |||
| range_N[0] = i; | |||
| range_N[1] = i + bk; | |||
| } else { | |||
| range_N[0] = range_n[0] + i; | |||
| range_N[1] = range_n[0] + i + bk; | |||
| } | |||
| CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||
| if (i > 0) { | |||
| TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||
| for (ls = 0; ls < i; ls += REAL_GEMM_R) { | |||
| min_l = i - ls; | |||
| if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||
| GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||
| if (n - bk - i > 0) { | |||
| for (is = i + bk; is < n; is += GEMM_P) { | |||
| min_i = n - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| if (ls == 0) { | |||
| NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||
| } else { | |||
| GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| } | |||
| GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sb_gemm, | |||
| a + (is + ls * lda) * COMPSIZE, lda); | |||
| } | |||
| } | |||
| for (is = 0; is < bk; is += GEMM_P) { | |||
| min_i = bk - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| TRMM_KERNEL_LT(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||
| a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||
| } | |||
| } | |||
| } else { | |||
| if (n - bk - i > 0) { | |||
| for (is = 0; is < n - bk - i; is += GEMM_P) { | |||
| min_i = n - bk - i - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); | |||
| TRSM_KERNEL_RT(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| lda = args -> lda; | |||
| a = (FLOAT *) args -> a; | |||
| args -> ldb = lda; | |||
| args -> ldc = lda; | |||
| args -> alpha = NULL; | |||
| start_j = 0; | |||
| while (start_j < n) start_j += NB; | |||
| start_j -= NB; | |||
| for (j = start_j ; j >=0 ; j-= NB) | |||
| { | |||
| jb = n - j; | |||
| if ( jb > NB ) jb = NB; | |||
| args -> n = jb; | |||
| args -> m = n-j-jb; | |||
| args -> a = &a[(j+jb+(j+jb)*lda) * COMPSIZE]; | |||
| args -> b = &a[(j+jb+j*lda) * COMPSIZE]; | |||
| args -> beta = beta_plus; | |||
| TRMM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| args -> beta = beta_minus; | |||
| TRSM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,46 +1,44 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2013, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2014/05/22 Saar | |||
| * TEST double precision unblocked : OK | |||
| * TEST double precision blocked : OK | |||
| * 2014/05/23 | |||
| * TEST single precision blocked : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| static FLOAT dp1 = 1.; | |||
| static FLOAT dm1 = -1.; | |||
| // static FLOAT dp1 = 1.; | |||
| // static FLOAT dm1 = -1.; | |||
| #ifdef UNIT | |||
| #define TRTI2 TRTI2_UU | |||
| @@ -48,152 +46,66 @@ static FLOAT dm1 = -1.; | |||
| #define TRTI2 TRTI2_UN | |||
| #endif | |||
| #if 0 | |||
| #undef GEMM_P | |||
| #undef GEMM_Q | |||
| #undef GEMM_R | |||
| #define GEMM_P 8 | |||
| #define GEMM_Q 20 | |||
| #define GEMM_R 64 | |||
| #ifdef UNIT | |||
| #define TRMM TRMM_LNUU | |||
| #define TRSM TRSM_RNUU | |||
| #else | |||
| #define TRMM TRMM_LNUN | |||
| #define TRSM TRSM_RNUN | |||
| #endif | |||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
| #define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) | |||
| blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { | |||
| BLASLONG n, lda; | |||
| BLASLONG j, n, lda; | |||
| FLOAT *a; | |||
| BLASLONG i, is, min_i, start_is; | |||
| BLASLONG ls, min_l; | |||
| BLASLONG bk; | |||
| BLASLONG blocking; | |||
| BLASLONG range_N[2]; | |||
| // BLASLONG info=0; | |||
| BLASLONG jb; | |||
| BLASLONG NB; | |||
| FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); | |||
| FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_A); | |||
| FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm | |||
| + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) | |||
| + GEMM_OFFSET_B); | |||
| FLOAT beta_plus[2] = { ONE, ZERO}; | |||
| FLOAT beta_minus[2] = {-ONE, ZERO}; | |||
| n = args -> n; | |||
| a = (FLOAT *)args -> a; | |||
| lda = args -> lda; | |||
| if (range_n) { | |||
| n = range_n[1] - range_n[0]; | |||
| a += range_n[0] * (lda + 1) * COMPSIZE; | |||
| } | |||
| NB = GEMM_Q; | |||
| if (n <= DTB_ENTRIES) { | |||
| if (n <= NB) { | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| return 0; | |||
| } | |||
| blocking = GEMM_Q; | |||
| if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; | |||
| for (i = 0; i < n; i += blocking) { | |||
| bk = MIN(blocking, n - i); | |||
| if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); | |||
| if (!range_n) { | |||
| range_N[0] = i; | |||
| range_N[1] = i + bk; | |||
| } else { | |||
| range_N[0] = range_n[0] + i; | |||
| range_N[1] = range_n[0] + i + bk; | |||
| } | |||
| CNAME(args, NULL, range_N, sa, sa_trmm, 0); | |||
| if (n -bk - i > 0) { | |||
| TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); | |||
| for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { | |||
| min_l = n - ls; | |||
| if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; | |||
| GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); | |||
| if (i > 0) { | |||
| for (is = 0; is < i; is += GEMM_P) { | |||
| min_i = i - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| if (ls == i + bk) { | |||
| //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| GEMM_BETA(min_i, bk, 0, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
| TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||
| } else { | |||
| GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| } | |||
| GEMM_KERNEL_N(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sb_gemm, | |||
| a + (is + ls * lda) * COMPSIZE, lda); | |||
| } | |||
| } | |||
| start_is = 0; | |||
| while (start_is < bk) start_is += GEMM_P; | |||
| start_is -= GEMM_P; | |||
| for (is = 0; is < bk; is += GEMM_P) { | |||
| min_i = bk - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| TRMM_KERNEL_LN(min_i, min_l, bk, dp1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa_trmm + is * bk * COMPSIZE, sb_gemm, | |||
| a + (i + is + ls * lda) * COMPSIZE, lda, is); | |||
| } | |||
| } | |||
| } else { | |||
| if (i > 0) { | |||
| for (is = 0; is < i; is += GEMM_P) { | |||
| min_i = i - is; | |||
| if (min_i > GEMM_P) min_i = GEMM_P; | |||
| //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
| GEMM_BETA(min_i, bk, 0, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
| lda = args -> lda; | |||
| a = (FLOAT *) args -> a; | |||
| args -> ldb = lda; | |||
| args -> ldc = lda; | |||
| args -> alpha = NULL; | |||
| TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| sa, sa_trsm, | |||
| a + (is + i * lda) * COMPSIZE, lda, 0); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| for (j = 0; j < n; j += NB) | |||
| { | |||
| jb = n - j; | |||
| if ( jb > NB ) jb = NB; | |||
| args -> n = jb; | |||
| args -> m = j; | |||
| args -> a = &a[0]; | |||
| args -> b = &a[(j*lda) * COMPSIZE]; | |||
| args -> beta = beta_plus; | |||
| TRMM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| args -> beta = beta_minus; | |||
| TRSM(args, NULL, NULL, sa, sb, 0); | |||
| args -> a = &a[(j+j*lda) * COMPSIZE]; | |||
| TRTI2(args, NULL, range_n, sa, sb, 0); | |||
| } | |||
| return 0; | |||
| } | |||