From c167a3d6f41ed2f5680c2a72846b4c9b5d416543 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Mon, 16 Apr 2018 12:34:43 -0700 Subject: [PATCH 001/121] Added RISCV build --- Makefile.riscv64 | 0 Makefile.system | 4 + c_check | 1 + common.h | 5 ++ common_riscv64.h | 93 +++++++++++++++++++ cpuid_riscv64.c | 111 +++++++++++++++++++++++ ctest.c | 4 + getarch.c | 19 ++++ kernel/riscv64/KERNEL | 149 +++++++++++++++++++++++++++++++ kernel/riscv64/amax.c | 75 ++++++++++++++++ kernel/riscv64/amin.c | 75 ++++++++++++++++ kernel/riscv64/asum.c | 67 ++++++++++++++ kernel/riscv64/axpby.c | 96 ++++++++++++++++++++ kernel/riscv64/axpy.c | 64 ++++++++++++++ kernel/riscv64/copy.c | 59 +++++++++++++ kernel/riscv64/dot.c | 64 ++++++++++++++ kernel/riscv64/gemv_n.c | 67 ++++++++++++++ kernel/riscv64/gemv_t.c | 68 ++++++++++++++ kernel/riscv64/iamax.c | 77 ++++++++++++++++ kernel/riscv64/iamin.c | 77 ++++++++++++++++ kernel/riscv64/imax.c | 69 +++++++++++++++ kernel/riscv64/imin.c | 67 ++++++++++++++ kernel/riscv64/izamax.c | 81 +++++++++++++++++ kernel/riscv64/izamin.c | 81 +++++++++++++++++ kernel/riscv64/max.c | 65 ++++++++++++++ kernel/riscv64/min.c | 65 ++++++++++++++ kernel/riscv64/nrm2.c | 88 ++++++++++++++++++ kernel/riscv64/omatcopy_cn.c | 90 +++++++++++++++++++ kernel/riscv64/omatcopy_ct.c | 89 +++++++++++++++++++ kernel/riscv64/omatcopy_rn.c | 90 +++++++++++++++++++ kernel/riscv64/omatcopy_rt.c | 62 +++++++++++++ kernel/riscv64/rot.c | 62 +++++++++++++ kernel/riscv64/scal.c | 63 +++++++++++++ kernel/riscv64/swap.c | 62 +++++++++++++ kernel/riscv64/symv_L.c | 70 +++++++++++++++ kernel/riscv64/symv_U.c | 71 +++++++++++++++ kernel/riscv64/zamax.c | 79 +++++++++++++++++ kernel/riscv64/zamin.c | 79 +++++++++++++++++ kernel/riscv64/zasum.c | 72 +++++++++++++++ kernel/riscv64/zaxpby.c | 118 +++++++++++++++++++++++++ kernel/riscv64/zaxpy.c | 74 ++++++++++++++++ kernel/riscv64/zcopy.c | 65 ++++++++++++++ kernel/riscv64/zdot.c | 80 +++++++++++++++++ kernel/riscv64/zgemv_n.c | 157 
+++++++++++++++++++++++++++++++++ kernel/riscv64/zgemv_t.c | 140 +++++++++++++++++++++++++++++ kernel/riscv64/znrm2.c | 106 ++++++++++++++++++++++ kernel/riscv64/zomatcopy_cn.c | 70 +++++++++++++++ kernel/riscv64/zomatcopy_cnc.c | 69 +++++++++++++++ kernel/riscv64/zomatcopy_ct.c | 71 +++++++++++++++ kernel/riscv64/zomatcopy_ctc.c | 71 +++++++++++++++ kernel/riscv64/zomatcopy_rn.c | 70 +++++++++++++++ kernel/riscv64/zomatcopy_rnc.c | 69 +++++++++++++++ kernel/riscv64/zomatcopy_rt.c | 72 +++++++++++++++ kernel/riscv64/zomatcopy_rtc.c | 72 +++++++++++++++ kernel/riscv64/zrot.c | 70 +++++++++++++++ kernel/riscv64/zscal.c | 88 ++++++++++++++++++ kernel/riscv64/zswap.c | 72 +++++++++++++++ lapack/laswp/riscv64/Makefile | 13 +++ param.h | 39 ++++++++ 59 files changed, 4166 insertions(+) create mode 100644 Makefile.riscv64 create mode 100644 common_riscv64.h create mode 100644 cpuid_riscv64.c create mode 100644 kernel/riscv64/KERNEL create mode 100644 kernel/riscv64/amax.c create mode 100644 kernel/riscv64/amin.c create mode 100644 kernel/riscv64/asum.c create mode 100644 kernel/riscv64/axpby.c create mode 100644 kernel/riscv64/axpy.c create mode 100644 kernel/riscv64/copy.c create mode 100644 kernel/riscv64/dot.c create mode 100644 kernel/riscv64/gemv_n.c create mode 100644 kernel/riscv64/gemv_t.c create mode 100644 kernel/riscv64/iamax.c create mode 100644 kernel/riscv64/iamin.c create mode 100644 kernel/riscv64/imax.c create mode 100644 kernel/riscv64/imin.c create mode 100644 kernel/riscv64/izamax.c create mode 100644 kernel/riscv64/izamin.c create mode 100644 kernel/riscv64/max.c create mode 100644 kernel/riscv64/min.c create mode 100644 kernel/riscv64/nrm2.c create mode 100644 kernel/riscv64/omatcopy_cn.c create mode 100644 kernel/riscv64/omatcopy_ct.c create mode 100644 kernel/riscv64/omatcopy_rn.c create mode 100644 kernel/riscv64/omatcopy_rt.c create mode 100644 kernel/riscv64/rot.c create mode 100644 kernel/riscv64/scal.c create mode 100644 kernel/riscv64/swap.c 
create mode 100644 kernel/riscv64/symv_L.c create mode 100644 kernel/riscv64/symv_U.c create mode 100644 kernel/riscv64/zamax.c create mode 100644 kernel/riscv64/zamin.c create mode 100644 kernel/riscv64/zasum.c create mode 100644 kernel/riscv64/zaxpby.c create mode 100644 kernel/riscv64/zaxpy.c create mode 100644 kernel/riscv64/zcopy.c create mode 100644 kernel/riscv64/zdot.c create mode 100644 kernel/riscv64/zgemv_n.c create mode 100644 kernel/riscv64/zgemv_t.c create mode 100644 kernel/riscv64/znrm2.c create mode 100644 kernel/riscv64/zomatcopy_cn.c create mode 100644 kernel/riscv64/zomatcopy_cnc.c create mode 100644 kernel/riscv64/zomatcopy_ct.c create mode 100644 kernel/riscv64/zomatcopy_ctc.c create mode 100644 kernel/riscv64/zomatcopy_rn.c create mode 100644 kernel/riscv64/zomatcopy_rnc.c create mode 100644 kernel/riscv64/zomatcopy_rt.c create mode 100644 kernel/riscv64/zomatcopy_rtc.c create mode 100644 kernel/riscv64/zrot.c create mode 100644 kernel/riscv64/zscal.c create mode 100644 kernel/riscv64/zswap.c create mode 100644 lapack/laswp/riscv64/Makefile diff --git a/Makefile.riscv64 b/Makefile.riscv64 new file mode 100644 index 000000000..e69de29bb diff --git a/Makefile.system b/Makefile.system index 142cb420f..02d392d9c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -593,7 +593,9 @@ endif ifndef BINARY_DEFINED ifneq ($(OSNAME), AIX) ifdef BINARY64 +ifneq ($(ARCH), riscv64) CCOMMON_OPT += -m64 +endif else CCOMMON_OPT += -m32 endif @@ -687,8 +689,10 @@ endif else ifdef BINARY64 ifneq ($(OSNAME), AIX) +ifneq ($(ARCH), riscv64) FCOMMON_OPT += -m64 endif +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -fdefault-integer-8 diff --git a/c_check b/c_check index a3b337602..c564855f3 100644 --- a/c_check +++ b/c_check @@ -76,6 +76,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture 
= riscv64 if ($data =~ /ARCH_RISCV64/); $defined = 0; diff --git a/common.h b/common.h index 5a599a5af..3d23d9ee6 100644 --- a/common.h +++ b/common.h @@ -408,6 +408,11 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif + +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..fe4e0a6d3 --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,93 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..129ed11b0 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,111 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 + +static char *cpuname[] = { + "UNKNOWN", +}; + +int detect(void){ + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("RISCV64"); +} + +void get_subarchitecture(void){ +} + +void get_subdirname(void){ + printf("riscv64"); +} + +void get_cpuconfig(void){ + printf("#define UNKNOWN\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); +} + +void get_libname(void){ + printf("riscv64\n"); +} diff --git a/ctest.c b/ctest.c index 00be423d1..cab939887 100644 --- a/ctest.c +++ b/ctest.c @@ -149,3 +149,7 @@ ARCH_ARM ARCH_ARM64 #endif +#if defined(__riscv) +ARCH_RISCV64 +#endif + diff --git a/getarch.c b/getarch.c index 992fc2b95..7f7fd97c4 100644 --- a/getarch.c +++ b/getarch.c @@ -604,6 +604,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + +#ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" @@ -859,6 +860,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64" +#define CORENAME "RISCV64" +#else +#endif + +#ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1051,6 +1066,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL new file mode 100644 index 000000000..7d854ced6 --- /dev/null +++ b/kernel/riscv64/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../riscv64/amax.c +DAMAXKERNEL = ../riscv64/amax.c +CAMAXKERNEL = ../riscv64/zamax.c +ZAMAXKERNEL = ../riscv64/zamax.c + +SAMINKERNEL = ../riscv64/amin.c +DAMINKERNEL = ../riscv64/amin.c +CAMINKERNEL = ../riscv64/zamin.c +ZAMINKERNEL = ../riscv64/zamin.c + +SMAXKERNEL = ../riscv64/max.c +DMAXKERNEL = ../riscv64/max.c + +SMINKERNEL = ../riscv64/min.c +DMINKERNEL = ../riscv64/min.c + +ISAMAXKERNEL = ../riscv64/iamax.c +IDAMAXKERNEL = ../riscv64/iamax.c +ICAMAXKERNEL = ../riscv64/izamax.c +IZAMAXKERNEL = ../riscv64/izamax.c + +ISAMINKERNEL = ../riscv64/iamin.c +IDAMINKERNEL = ../riscv64/iamin.c +ICAMINKERNEL = ../riscv64/izamin.c +IZAMINKERNEL = ../riscv64/izamin.c + +ISMAXKERNEL = ../riscv64/imax.c +IDMAXKERNEL = ../riscv64/imax.c + +ISMINKERNEL = ../riscv64/imin.c +IDMINKERNEL = ../riscv64/imin.c + +SASUMKERNEL = ../riscv64/asum.c +DASUMKERNEL = ../riscv64/asum.c +CASUMKERNEL = ../riscv64/zasum.c +ZASUMKERNEL = ../riscv64/zasum.c + +SAXPYKERNEL = ../riscv64/axpy.c +DAXPYKERNEL = ../riscv64/axpy.c +CAXPYKERNEL = ../riscv64/zaxpy.c +ZAXPYKERNEL = ../riscv64/zaxpy.c + +SCOPYKERNEL = ../riscv64/copy.c +DCOPYKERNEL = ../riscv64/copy.c +CCOPYKERNEL = ../riscv64/zcopy.c +ZCOPYKERNEL = ../riscv64/zcopy.c + +SDOTKERNEL = ../riscv64/dot.c +DDOTKERNEL = ../riscv64/dot.c +CDOTKERNEL = ../riscv64/zdot.c +ZDOTKERNEL = ../riscv64/zdot.c + +SNRM2KERNEL = ../riscv64/nrm2.c +DNRM2KERNEL = ../riscv64/nrm2.c +CNRM2KERNEL = ../riscv64/znrm2.c +ZNRM2KERNEL = ../riscv64/znrm2.c + +SROTKERNEL = ../riscv64/rot.c +DROTKERNEL = ../riscv64/rot.c +CROTKERNEL = ../riscv64/zrot.c +ZROTKERNEL = ../riscv64/zrot.c + +SSCALKERNEL = ../riscv64/scal.c +DSCALKERNEL = 
../riscv64/scal.c +CSCALKERNEL = ../riscv64/zscal.c +ZSCALKERNEL = ../riscv64/zscal.c + +SSWAPKERNEL = ../riscv64/swap.c +DSWAPKERNEL = ../riscv64/swap.c +CSWAPKERNEL = ../riscv64/zswap.c +ZSWAPKERNEL = ../riscv64/zswap.c + +SGEMVNKERNEL = ../riscv64/gemv_n.c +DGEMVNKERNEL = ../riscv64/gemv_n.c +CGEMVNKERNEL = ../riscv64/zgemv_n.c +ZGEMVNKERNEL = ../riscv64/zgemv_n.c + +SGEMVTKERNEL = ../riscv64/gemv_t.c +DGEMVTKERNEL = ../riscv64/gemv_t.c +CGEMVTKERNEL = ../riscv64/zgemv_t.c +ZGEMVTKERNEL = ../riscv64/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT 
= ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax.c b/kernel/riscv64/amax.c new file mode 100644 index 000000000..792e68bd9 --- /dev/null +++ b/kernel/riscv64/amax.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/asum.c b/kernel/riscv64/asum.c new file mode 100644 index 000000000..b284ae3fc --- /dev/null +++ b/kernel/riscv64/asum.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c new file mode 100644 index 000000000..278747f75 --- /dev/null +++ b/kernel/riscv64/axpby.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + + ix = 0; + iy = 0; + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + +} + + diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c new file mode 100644 index 000000000..fb1094dd9 --- /dev/null +++ b/kernel/riscv64/axpy.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c new file mode 100644 index 000000000..7b4f04f30 --- /dev/null +++ b/kernel/riscv64/copy.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c new file mode 100644 index 000000000..46a84ad18 --- /dev/null +++ b/kernel/riscv64/dot.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c new file mode 100644 index 000000000..598cba387 --- /dev/null +++ b/kernel/riscv64/imin.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c new file mode 100644 index 000000000..fcff09337 --- /dev/null +++ b/kernel/riscv64/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/omatcopy_cn.c b/kernel/riscv64/omatcopy_cn.c new file mode 100644 index 000000000..4d11b9125 --- /dev/null +++ b/kernel/riscv64/omatcopy_cn.c @@ -0,0 +1,90 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2014/06/09 Saar + * + * Order ColMajor + * No Trans + * +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/symv_L.c b/kernel/riscv64/symv_L.c new file mode 100644 index 
000000000..8f48d03f5 --- /dev/null +++ b/kernel/riscv64/symv_L.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if ( m != offset ) + printf("Symv_L: m=%d offset=%d\n",m,offset); +#endif + + jx = 0; + jy = 0; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zasum.c b/kernel/riscv64/zasum.c new file mode 100644 index 000000000..61e85cae6 --- /dev/null +++ b/kernel/riscv64/zasum.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zaxpby.c b/kernel/riscv64/zaxpby.c new file mode 100644 index 000000000..445354416 --- /dev/null +++ b/kernel/riscv64/zaxpby.c @@ -0,0 +1,118 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/*************************************************************************** +* 2014/06/07 Saar +* +***************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { 
+ temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + +} + + diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c new file mode 100644 index 000000000..1dcaeac27 --- /dev/null +++ b/kernel/riscv64/zaxpy.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/15 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c new file mode 100644 index 000000000..07fe584c5 --- /dev/null +++ b/kernel/riscv64/zcopy.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zdot.c b/kernel/riscv64/zdot.c new file mode 100644 index 000000000..733c235c6 --- /dev/null +++ b/kernel/riscv64/zdot.c @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : FAIL +* BLASTEST double : FAIL +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} + + diff --git a/kernel/riscv64/zgemv_n.c b/kernel/riscv64/zgemv_n.c new file mode 100644 index 000000000..b9b03f792 --- /dev/null +++ b/kernel/riscv64/zgemv_n.c @@ -0,0 +1,157 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** + * * 2013/11/23 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/zomatcopy_cn.c b/kernel/riscv64/zomatcopy_cn.c new file mode 100644 index 000000000..f5a7a6284 --- /dev/null +++ b/kernel/riscv64/zomatcopy_cn.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights 
reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2014/06/09 Saar + * + * Order ColMajor + * No Trans + * +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j,ia; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + lda *= 2; + ldb *= 2; + + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/riscv64/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 189cdc4a0..52675bc25 100644 --- a/param.h +++ b/param.h @@ -2343,6 +2343,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#ifdef RISCV64 +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 From 0ee395db35ee824aff77d4d2b812aaedb111addd Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 18 Apr 2018 18:03:32 -0700 Subject: [PATCH 002/121] Fixed TRMM and SYMM for RISCV --- kernel/Makefile.L3 | 4 ++++ kernel/riscv64/KERNEL | 10 ++++++++++ param.h | 8 ++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4284fbfa0..63e09a56d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -20,6 +20,10 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif +ifeq ($(ARCH), riscv64) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index 7d854ced6..04d82b4ce 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -129,6 +129,16 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = 
../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + LSAME_KERNEL = ../generic/lsame.c SCABS_KERNEL = ../generic/cabs.c diff --git a/param.h b/param.h index 52675bc25..22d837960 100644 --- a/param.h +++ b/param.h @@ -2348,11 +2348,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 From db17ce896fbbf53cbef34f81e1f1ec6887965ec4 Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Wed, 5 Sep 2018 12:49:37 +0800 Subject: [PATCH 003/121] replace ARCH with AR in lapack-netlib --- Makefile | 4 +-- c_check | 4 +++ lapack-netlib/BLAS/SRC/Makefile | 10 +++--- lapack-netlib/CBLAS/src/Makefile | 32 +++++++++---------- lapack-netlib/DOCS/lawn81.tex | 2 +- lapack-netlib/INSTALL/make.inc.ALPHA | 4 +-- lapack-netlib/INSTALL/make.inc.HPPA | 4 +-- lapack-netlib/INSTALL/make.inc.IRIX64 | 4 +-- lapack-netlib/INSTALL/make.inc.O2K | 4 +-- lapack-netlib/INSTALL/make.inc.SGI5 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 4 +-- lapack-netlib/INSTALL/make.inc.XLF | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran_debug | 4 +-- lapack-netlib/INSTALL/make.inc.ifort | 4 +-- lapack-netlib/INSTALL/make.inc.pgf95 | 4 +-- lapack-netlib/INSTALL/make.inc.pghpf | 4 +-- lapack-netlib/LAPACKE/src/Makefile | 10 +++--- lapack-netlib/LAPACKE/utils/Makefile | 2 +- lapack-netlib/SRC/Makefile | 10 +++--- lapack-netlib/SRC/VARIANTS/Makefile | 12 +++---- 
lapack-netlib/TESTING/MATGEN/Makefile | 10 +++--- lapack-netlib/make.inc.example | 4 +-- make.inc | 2 +- 25 files changed, 79 insertions(+), 75 deletions(-) diff --git a/Makefile b/Makefile index c0e5fbcf8..aaeb0c498 100644 --- a/Makefile +++ b/Makefile @@ -237,8 +237,8 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/c_check b/c_check index c564855f3..f86a37b5b 100644 --- a/c_check +++ b/c_check @@ -121,6 +121,10 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index a436365aa..f7236318b 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -138,23 +138,23 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) 
$(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) FRC: diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index 6c0518ac7..9b9063d8d 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -45,22 +45,22 @@ sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o # Single precision real slib1: $(slev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib1: $(dlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib1: $(clev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib1: $(zlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -97,22 +97,22 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \ # Single precision real slib2: $(slev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib2: $(dlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib2: $(clev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib2: $(zlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -143,22 +143,22 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \ # Single precision real slib3: $(slev3) $(errhand) - $(ARCH) 
$(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib3: $(dlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib3: $(clev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib3: $(zlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) @@ -168,22 +168,22 @@ alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) # All level 1 all1: $(alev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 2 all2: $(alev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 3 all3: $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All levels and precisions $(CBLASLIB): $(alev1) $(alev2) $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ FRC: diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 291735299..01c7c39e2 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -466,7 +466,7 @@ TIMER = EXT_ETIME Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. 
If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index 0ceeaa155..049cf0b13 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index 8eabbbdf4..2bd8ee16e 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -29,8 +29,8 @@ LOADOPTS = -Aa +U77 # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index d9e71e1bf..0f57941b5 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index 3ffcadacc..d99beca41 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 # The archiver and the flag(s) to use when building an archive # (library). 
If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index c7019ac16..c4a702d48 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index 4e44f1beb..6a78e9576 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -29,8 +29,8 @@ LOADOPTS = -dalign -O4 -fast # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index e6d79add3..0ac3cc4e4 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -33,8 +33,8 @@ LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 9466ee332..27e22cac9 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -30,8 +30,8 @@ LOADOPTS = -qnosave # The archiver and the flag(s) to use when building an archive # (library). 
If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index 39d98d4d4..b342b18a8 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index 10e6381df..1eaed2102 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index b067bd484..a3c37428e 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index a9a5cec98..931ff378f 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 1d9bf549c..0dfe8c683 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 44884d4a5..03c140bf7 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2455,16 +2455,16 @@ endif all: ../../$(LAPACKELIB) ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) + $(AR) $(ARFLAGS) $@ $(OBJ_A) + $(AR) $(ARFLAGS) $@ $(OBJ_B) ifdef BUILD_DEPRECATED - $(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $(DEPRECATED) endif ifdef (USEXBLAS) - $(ARCH) $(ARCHFLAGS) $@ $(EXTENDED) + $(AR) $(ARFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG - $(ARCH) $(ARCHFLAGS) $@ $(MATGEN) + $(AR) $(ARFLAGS) $@ $(MATGEN) endif $(RANLIB) $@ diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index 1f639c6ea..c6204ee3b 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -186,7 +186,7 @@ OBJ = lapacke_cgb_nancheck.o \ all: lib lib: $(OBJ) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $^ + $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $^ $(RANLIB) ../../$(LAPACKELIB) clean: cleanobj diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 531cb51fc..e5bb7a3db 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -553,26 +553,26 @@ endif all: ../$(LAPACKLIB) 
../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ $(CXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ $(DXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ $(ZXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 9f1410755..7d0e8824c 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -33,27 +33,27 @@ QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a cholrl.a: $(CHOLRL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ choltop.a: $(CHOLTOP) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lucr.a: $(LUCR) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lull.a: $(LULL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lurec.a: $(LUREC) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ qrll.a: $(QRLL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ 
$(RANLIB) $@ clean: cleanobj cleanlib diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e20004c2f..f5ea5a8c0 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -58,23 +58,23 @@ ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) ../../$(TMGLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ single: $(SMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) $(SCATGEN): $(FRC) diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index d780c3a23..3ddb9eafc 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/make.inc b/make.inc index b6ed098c0..93b355103 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) -#ARCHFLAGS= $(ARFLAGS) -ru +#ARFLAGS= $(ARFLAGS) -ru #RANLIB = ranlib From 0b7ccb9e381d4bc3d0149c158631389c2c2d411c Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Thu, 6 Sep 2018 13:08:30 +0800 Subject: [PATCH 004/121] Revert "replace ARCH with AR in lapack-netlib" This reverts commit db17ce896fbbf53cbef34f81e1f1ec6887965ec4. 
--- Makefile | 4 +-- c_check | 4 --- lapack-netlib/BLAS/SRC/Makefile | 10 +++--- lapack-netlib/CBLAS/src/Makefile | 32 +++++++++---------- lapack-netlib/DOCS/lawn81.tex | 2 +- lapack-netlib/INSTALL/make.inc.ALPHA | 4 +-- lapack-netlib/INSTALL/make.inc.HPPA | 4 +-- lapack-netlib/INSTALL/make.inc.IRIX64 | 4 +-- lapack-netlib/INSTALL/make.inc.O2K | 4 +-- lapack-netlib/INSTALL/make.inc.SGI5 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 4 +-- lapack-netlib/INSTALL/make.inc.XLF | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran_debug | 4 +-- lapack-netlib/INSTALL/make.inc.ifort | 4 +-- lapack-netlib/INSTALL/make.inc.pgf95 | 4 +-- lapack-netlib/INSTALL/make.inc.pghpf | 4 +-- lapack-netlib/LAPACKE/src/Makefile | 10 +++--- lapack-netlib/LAPACKE/utils/Makefile | 2 +- lapack-netlib/SRC/Makefile | 10 +++--- lapack-netlib/SRC/VARIANTS/Makefile | 12 +++---- lapack-netlib/TESTING/MATGEN/Makefile | 10 +++--- lapack-netlib/make.inc.example | 4 +-- make.inc | 2 +- 25 files changed, 75 insertions(+), 79 deletions(-) diff --git a/Makefile b/Makefile index aaeb0c498..c0e5fbcf8 100644 --- a/Makefile +++ b/Makefile @@ -237,8 +237,8 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/c_check b/c_check index f86a37b5b..c564855f3 100644 --- a/c_check +++ b/c_check @@ -121,10 
+121,6 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } -if ($architecture eq "riscv64") { - $defined = 1; -} - if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index f7236318b..a436365aa 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -138,23 +138,23 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) FRC: diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index 9b9063d8d..6c0518ac7 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -45,22 +45,22 @@ sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o # Single precision real slib1: $(slev1) $(sclev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib1: $(dlev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib1: $(clev1) $(sclev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex 
zlib1: $(zlev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -97,22 +97,22 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \ # Single precision real slib2: $(slev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib2: $(dlev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib2: $(clev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib2: $(zlev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -143,22 +143,22 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \ # Single precision real slib3: $(slev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib3: $(dlev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib3: $(clev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib3: $(zlev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) @@ -168,22 +168,22 @@ alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) # All level 1 all1: $(alev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 2 all2: $(alev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 3 all3: $(alev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All levels and precisions $(CBLASLIB): $(alev1) $(alev2) $(alev3) 
$(errhand) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ FRC: diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 01c7c39e2..291735299 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -466,7 +466,7 @@ TIMER = EXT_ETIME Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index 049cf0b13..0ceeaa155 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index 2bd8ee16e..8eabbbdf4 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -29,8 +29,8 @@ LOADOPTS = -Aa +U77 # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index 0f57941b5..d9e71e1bf 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index d99beca41..3ffcadacc 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index c4a702d48..c7019ac16 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index 6a78e9576..4e44f1beb 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -29,8 +29,8 @@ LOADOPTS = -dalign -O4 -fast # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index 0ac3cc4e4..e6d79add3 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -33,8 +33,8 @@ LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 27e22cac9..9466ee332 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -30,8 +30,8 @@ LOADOPTS = -qnosave # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index b342b18a8..39d98d4d4 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index 1eaed2102..10e6381df 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index a3c37428e..b067bd484 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index 931ff378f..a9a5cec98 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 0dfe8c683..1d9bf549c 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 03c140bf7..44884d4a5 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2455,16 +2455,16 @@ endif all: ../../$(LAPACKELIB) ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) - $(AR) $(ARFLAGS) $@ $(OBJ_A) - $(AR) $(ARFLAGS) $@ $(OBJ_B) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) ifdef BUILD_DEPRECATED - $(AR) $(ARFLAGS) $@ $(DEPRECATED) + $(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED) endif ifdef (USEXBLAS) - $(AR) $(ARFLAGS) $@ $(EXTENDED) + $(ARCH) $(ARCHFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG - $(AR) $(ARFLAGS) $@ $(MATGEN) + $(ARCH) $(ARCHFLAGS) $@ $(MATGEN) endif $(RANLIB) $@ diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index c6204ee3b..1f639c6ea 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -186,7 +186,7 @@ OBJ = lapacke_cgb_nancheck.o \ all: lib lib: $(OBJ) - $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $^ $(RANLIB) ../../$(LAPACKELIB) clean: cleanobj diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index e5bb7a3db..531cb51fc 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -553,26 +553,26 @@ endif all: ../$(LAPACKLIB) ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - 
$(AR) $(ARFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ $(CXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) $(DZLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ $(DXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ $(ZXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 7d0e8824c..9f1410755 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -33,27 +33,27 @@ QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a cholrl.a: $(CHOLRL) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ choltop.a: $(CHOLTOP) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ lucr.a: $(LUCR) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ lull.a: $(LULL) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ lurec.a: $(LUREC) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ qrll.a: $(QRLL) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ clean: cleanobj cleanlib diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index f5ea5a8c0..e20004c2f 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -58,23 +58,23 @@ ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) ../../$(TMGLIB): $(ALLOBJ) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ single: $(SMATGEN) $(SCATGEN) - $(AR) $(ARFLAGS) 
../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) $(SCATGEN): $(FRC) diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index 3ddb9eafc..d780c3a23 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/make.inc b/make.inc index 93b355103..b6ed098c0 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) -#ARFLAGS= $(ARFLAGS) -ru +#ARCHFLAGS= $(ARFLAGS) -ru #RANLIB = ranlib From a1bdc308b8d4dcb924f339ca5018c12a455d2652 Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Thu, 6 Sep 2018 13:13:36 +0800 Subject: [PATCH 005/121] override ARCH (archiver) in lapack-netlib/make.inc --- Makefile | 2 +- c_check | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c0e5fbcf8..547feb0d2 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,7 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> 
$(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/c_check b/c_check index c564855f3..eb302b71a 100644 --- a/c_check +++ b/c_check @@ -121,6 +121,11 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); From 44020a42a453579740fd16cd23e76f4267c41b65 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 27 Feb 2020 14:29:42 +0800 Subject: [PATCH 006/121] Fixed compile bug for RV64. --- kernel/riscv64/KERNEL | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index 04d82b4ce..ea6a8cf21 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -35,6 +35,11 @@ DASUMKERNEL = ../riscv64/asum.c CASUMKERNEL = ../riscv64/zasum.c ZASUMKERNEL = ../riscv64/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../riscv64/axpy.c DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c From 265ab484c89d10dfff2d5df678221918d7a880e3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 27 Feb 2020 14:46:15 +0800 Subject: [PATCH 007/121] Change default RISC-V 64-bit corename to RISCV64_GENERIC e.g. 
make CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran TARGET=RISCV64_GENERIC HOSTCC=gcc --- TargetList.txt | 3 + getarch.c | 10 +- kernel/riscv64/KERNEL | 162 +++---------------------- kernel/riscv64/KERNEL.RISCV64_GENERIC | 164 ++++++++++++++++++++++++++ param.h | 2 +- 5 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 kernel/riscv64/KERNEL.RISCV64_GENERIC diff --git a/TargetList.txt b/TargetList.txt index 6a57bf1af..3b018e17a 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -97,3 +97,6 @@ TSV110 ZARCH_GENERIC Z13 Z14 + +10.RISC-V 64: +RISCV64_GENERIC diff --git a/getarch.c b/getarch.c index d0d260577..58706c452 100644 --- a/getarch.c +++ b/getarch.c @@ -906,17 +906,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#ifdef FORCE_RISCV64 +#ifdef FORCE_RISCV64_GENERIC #define FORCE #define ARCHITECTURE "RISCV64" -#define SUBARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" #define SUBDIRNAME "riscv64" -#define ARCHCONFIG "-DRISCV64 " \ +#define ARCHCONFIG "-DRISCV64_GENERIC " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "riscv64" -#define CORENAME "RISCV64" +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" #else #endif diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index ea6a8cf21..68d68b5f8 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -1,154 +1,18 @@ -SAMAXKERNEL = ../riscv64/amax.c -DAMAXKERNEL = ../riscv64/amax.c -CAMAXKERNEL = ../riscv64/zamax.c -ZAMAXKERNEL = ../riscv64/zamax.c - -SAMINKERNEL = ../riscv64/amin.c -DAMINKERNEL = ../riscv64/amin.c -CAMINKERNEL = ../riscv64/zamin.c -ZAMINKERNEL = ../riscv64/zamin.c - -SMAXKERNEL = ../riscv64/max.c -DMAXKERNEL = ../riscv64/max.c - -SMINKERNEL = ../riscv64/min.c -DMINKERNEL = ../riscv64/min.c - -ISAMAXKERNEL = ../riscv64/iamax.c 
-IDAMAXKERNEL = ../riscv64/iamax.c -ICAMAXKERNEL = ../riscv64/izamax.c -IZAMAXKERNEL = ../riscv64/izamax.c - -ISAMINKERNEL = ../riscv64/iamin.c -IDAMINKERNEL = ../riscv64/iamin.c -ICAMINKERNEL = ../riscv64/izamin.c -IZAMINKERNEL = ../riscv64/izamin.c - -ISMAXKERNEL = ../riscv64/imax.c -IDMAXKERNEL = ../riscv64/imax.c - -ISMINKERNEL = ../riscv64/imin.c -IDMINKERNEL = ../riscv64/imin.c - -SASUMKERNEL = ../riscv64/asum.c -DASUMKERNEL = ../riscv64/asum.c -CASUMKERNEL = ../riscv64/zasum.c -ZASUMKERNEL = ../riscv64/zasum.c - -SSUMKERNEL = ../arm/sum.c -DSUMKERNEL = ../arm/sum.c -CSUMKERNEL = ../arm/zsum.c -ZSUMKERNEL = ../arm/zsum.c - -SAXPYKERNEL = ../riscv64/axpy.c -DAXPYKERNEL = ../riscv64/axpy.c -CAXPYKERNEL = ../riscv64/zaxpy.c -ZAXPYKERNEL = ../riscv64/zaxpy.c - -SCOPYKERNEL = ../riscv64/copy.c -DCOPYKERNEL = ../riscv64/copy.c -CCOPYKERNEL = ../riscv64/zcopy.c -ZCOPYKERNEL = ../riscv64/zcopy.c - -SDOTKERNEL = ../riscv64/dot.c -DDOTKERNEL = ../riscv64/dot.c -CDOTKERNEL = ../riscv64/zdot.c -ZDOTKERNEL = ../riscv64/zdot.c - -SNRM2KERNEL = ../riscv64/nrm2.c -DNRM2KERNEL = ../riscv64/nrm2.c -CNRM2KERNEL = ../riscv64/znrm2.c -ZNRM2KERNEL = ../riscv64/znrm2.c - -SROTKERNEL = ../riscv64/rot.c -DROTKERNEL = ../riscv64/rot.c -CROTKERNEL = ../riscv64/zrot.c -ZROTKERNEL = ../riscv64/zrot.c - -SSCALKERNEL = ../riscv64/scal.c -DSCALKERNEL = ../riscv64/scal.c -CSCALKERNEL = ../riscv64/zscal.c -ZSCALKERNEL = ../riscv64/zscal.c - -SSWAPKERNEL = ../riscv64/swap.c -DSWAPKERNEL = ../riscv64/swap.c -CSWAPKERNEL = ../riscv64/zswap.c -ZSWAPKERNEL = ../riscv64/zswap.c - -SGEMVNKERNEL = ../riscv64/gemv_n.c -DGEMVNKERNEL = ../riscv64/gemv_n.c -CGEMVNKERNEL = ../riscv64/zgemv_n.c -ZGEMVNKERNEL = ../riscv64/zgemv_n.c - -SGEMVTKERNEL = ../riscv64/gemv_t.c -DGEMVTKERNEL = ../riscv64/gemv_t.c -CGEMVTKERNEL = ../riscv64/zgemv_t.c -ZGEMVTKERNEL = ../riscv64/zgemv_t.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c 
-ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SSYMV_U_KERNEL = ../generic/symv_k.c -SSYMV_L_KERNEL = ../generic/symv_k.c -DSYMV_U_KERNEL = ../generic/symv_k.c -DSYMV_L_KERNEL = ../generic/symv_k.c -CSYMV_U_KERNEL = ../generic/zsymv_k.c -CSYMV_L_KERNEL = ../generic/zsymv_k.c -ZSYMV_U_KERNEL = ../generic/zsymv_k.c -ZSYMV_L_KERNEL = ../generic/zsymv_k.c - - -LSAME_KERNEL = ../generic/lsame.c - +ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef 
DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c @@ -162,3 +26,5 @@ endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif + + diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC new file mode 100644 index 000000000..ea6a8cf21 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -0,0 +1,164 @@ +SAMAXKERNEL = ../riscv64/amax.c +DAMAXKERNEL = ../riscv64/amax.c +CAMAXKERNEL = ../riscv64/zamax.c +ZAMAXKERNEL = ../riscv64/zamax.c + +SAMINKERNEL = ../riscv64/amin.c +DAMINKERNEL = ../riscv64/amin.c +CAMINKERNEL = ../riscv64/zamin.c +ZAMINKERNEL = ../riscv64/zamin.c + +SMAXKERNEL = ../riscv64/max.c +DMAXKERNEL = ../riscv64/max.c + +SMINKERNEL = ../riscv64/min.c +DMINKERNEL = ../riscv64/min.c + +ISAMAXKERNEL = ../riscv64/iamax.c +IDAMAXKERNEL = ../riscv64/iamax.c +ICAMAXKERNEL = ../riscv64/izamax.c +IZAMAXKERNEL = ../riscv64/izamax.c + +ISAMINKERNEL = ../riscv64/iamin.c +IDAMINKERNEL = ../riscv64/iamin.c +ICAMINKERNEL = ../riscv64/izamin.c +IZAMINKERNEL = ../riscv64/izamin.c + +ISMAXKERNEL = ../riscv64/imax.c +IDMAXKERNEL = ../riscv64/imax.c + +ISMINKERNEL = ../riscv64/imin.c +IDMINKERNEL = ../riscv64/imin.c + +SASUMKERNEL = ../riscv64/asum.c +DASUMKERNEL = ../riscv64/asum.c +CASUMKERNEL = ../riscv64/zasum.c +ZASUMKERNEL = ../riscv64/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../riscv64/axpy.c +DAXPYKERNEL = ../riscv64/axpy.c +CAXPYKERNEL = ../riscv64/zaxpy.c +ZAXPYKERNEL = ../riscv64/zaxpy.c + +SCOPYKERNEL = ../riscv64/copy.c +DCOPYKERNEL = ../riscv64/copy.c +CCOPYKERNEL = ../riscv64/zcopy.c +ZCOPYKERNEL = ../riscv64/zcopy.c + +SDOTKERNEL = ../riscv64/dot.c +DDOTKERNEL = ../riscv64/dot.c +CDOTKERNEL = ../riscv64/zdot.c +ZDOTKERNEL = 
../riscv64/zdot.c + +SNRM2KERNEL = ../riscv64/nrm2.c +DNRM2KERNEL = ../riscv64/nrm2.c +CNRM2KERNEL = ../riscv64/znrm2.c +ZNRM2KERNEL = ../riscv64/znrm2.c + +SROTKERNEL = ../riscv64/rot.c +DROTKERNEL = ../riscv64/rot.c +CROTKERNEL = ../riscv64/zrot.c +ZROTKERNEL = ../riscv64/zrot.c + +SSCALKERNEL = ../riscv64/scal.c +DSCALKERNEL = ../riscv64/scal.c +CSCALKERNEL = ../riscv64/zscal.c +ZSCALKERNEL = ../riscv64/zscal.c + +SSWAPKERNEL = ../riscv64/swap.c +DSWAPKERNEL = ../riscv64/swap.c +CSWAPKERNEL = ../riscv64/zswap.c +ZSWAPKERNEL = ../riscv64/zswap.c + +SGEMVNKERNEL = ../riscv64/gemv_n.c +DGEMVNKERNEL = ../riscv64/gemv_n.c +CGEMVNKERNEL = ../riscv64/zgemv_n.c +ZGEMVNKERNEL = ../riscv64/zgemv_n.c + +SGEMVTKERNEL = ../riscv64/gemv_t.c +DGEMVTKERNEL = ../riscv64/gemv_t.c +CGEMVTKERNEL = ../riscv64/zgemv_t.c +ZGEMVTKERNEL = ../riscv64/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/param.h b/param.h index d42724a57..4a7765012 100644 --- a/param.h +++ b/param.h @@ -2509,7 +2509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef RISCV64 +#ifdef RISCV64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL From ef8e7d0279dfd1f9d9bec32b514a853d10bfdda7 Mon Sep 17 00:00:00 2001 From: damonyu Date: Thu, 15 Oct 2020 16:05:37 +0800 Subject: [PATCH 008/121] Add the support for RISC-V Vector. 
Change-Id: Iae7800a32f5af3903c330882cdf6f292d885f266 --- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + Makefile.system | 5 +- TargetList.txt | 3 + c_check | 6 + common.h | 4 + common_riscv64.h | 98 + cpuid_riscv64.c | 113 ++ ctest.c | 4 + getarch.c | 33 + kernel/Makefile.L3 | 4 + kernel/generic/trmmkernel_16x4.c | 2092 ++++++++++++++++++++ kernel/generic/trmmkernel_8x4.c | 1317 +++++++++++++ kernel/generic/trmmkernel_8x8.c | 2207 ++++++++++++++++++++++ kernel/riscv64/KERNEL | 30 + kernel/riscv64/KERNEL.C910V | 190 ++ kernel/riscv64/KERNEL.RISCV64_GENERIC | 164 ++ kernel/riscv64/amax.c | 75 + kernel/riscv64/amax_vector.c | 245 +++ kernel/riscv64/amin.c | 75 + kernel/riscv64/amin_vector.c | 241 +++ kernel/riscv64/asum.c | 67 + kernel/riscv64/asum_vector.c | 131 ++ kernel/riscv64/axpby.c | 96 + kernel/riscv64/axpby_vector.c | 378 ++++ kernel/riscv64/axpy.c | 64 + kernel/riscv64/axpy_vector.c | 179 ++ kernel/riscv64/copy.c | 59 + kernel/riscv64/copy_vector.c | 148 ++ kernel/riscv64/dgemm_kernel_8x4_c910v.c | 977 ++++++++++ kernel/riscv64/dot.c | 64 + kernel/riscv64/dot_vector.c | 172 ++ kernel/riscv64/gemv_n.c | 67 + kernel/riscv64/gemv_n_vector.c | 146 ++ kernel/riscv64/gemv_t.c | 68 + kernel/riscv64/gemv_t_vector.c | 126 ++ kernel/riscv64/iamax.c | 77 + kernel/riscv64/iamax_vector.c | 191 ++ kernel/riscv64/iamin.c | 77 + kernel/riscv64/iamin_vector.c | 192 ++ kernel/riscv64/imax.c | 69 + kernel/riscv64/imax_vector.c | 176 ++ kernel/riscv64/imin.c | 67 + kernel/riscv64/imin_vector.c | 212 +++ kernel/riscv64/izamax.c | 81 + kernel/riscv64/izamax_vector.c | 246 +++ kernel/riscv64/izamin.c | 81 + kernel/riscv64/izamin_vector.c | 247 +++ kernel/riscv64/max.c | 65 + kernel/riscv64/max_vector.c | 116 ++ kernel/riscv64/min.c | 65 + kernel/riscv64/min_vector.c | 116 ++ kernel/riscv64/nrm2.c | 88 + kernel/riscv64/nrm2_vector.c | 220 +++ kernel/riscv64/nrm2_vector_dot.c | 128 ++ kernel/riscv64/omatcopy_cn.c | 90 + kernel/riscv64/omatcopy_ct.c | 89 + 
kernel/riscv64/omatcopy_rn.c | 90 + kernel/riscv64/omatcopy_rt.c | 62 + kernel/riscv64/rot.c | 62 + kernel/riscv64/rot_vector.c | 196 ++ kernel/riscv64/scal.c | 63 + kernel/riscv64/scal_vector.c | 133 ++ kernel/riscv64/sgemm_kernel_16x4_c910v.c | 1575 +++++++++++++++ kernel/riscv64/swap.c | 62 + kernel/riscv64/swap_vector.c | 173 ++ kernel/riscv64/symv_L.c | 70 + kernel/riscv64/symv_L_vector.c | 265 +++ kernel/riscv64/symv_U.c | 71 + kernel/riscv64/symv_U_vector.c | 264 +++ kernel/riscv64/zamax.c | 79 + kernel/riscv64/zamax_vector.c | 104 + kernel/riscv64/zamin.c | 79 + kernel/riscv64/zamin_vector.c | 104 + kernel/riscv64/zasum.c | 72 + kernel/riscv64/zasum_vector.c | 136 ++ kernel/riscv64/zaxpby.c | 118 ++ kernel/riscv64/zaxpby_vector.c | 197 ++ kernel/riscv64/zaxpy.c | 74 + kernel/riscv64/zaxpy_vector.c | 107 ++ kernel/riscv64/zcopy.c | 65 + kernel/riscv64/zcopy_vector.c | 92 + kernel/riscv64/zdot.c | 80 + kernel/riscv64/zdot_vector.c | 135 ++ kernel/riscv64/zgemv_n.c | 157 ++ kernel/riscv64/zgemv_n_vector.c | 175 ++ kernel/riscv64/zgemv_t.c | 140 ++ kernel/riscv64/zgemv_t_vector.c | 134 ++ kernel/riscv64/zhemv_LM_vector.c | 191 ++ kernel/riscv64/zhemv_UV_vector.c | 192 ++ kernel/riscv64/znrm2.c | 106 ++ kernel/riscv64/znrm2_vector.c | 278 +++ kernel/riscv64/zomatcopy_cn.c | 70 + kernel/riscv64/zomatcopy_cnc.c | 69 + kernel/riscv64/zomatcopy_ct.c | 71 + kernel/riscv64/zomatcopy_ctc.c | 71 + kernel/riscv64/zomatcopy_rn.c | 70 + kernel/riscv64/zomatcopy_rnc.c | 69 + kernel/riscv64/zomatcopy_rt.c | 72 + kernel/riscv64/zomatcopy_rtc.c | 72 + kernel/riscv64/zrot.c | 70 + kernel/riscv64/zrot_vector.c | 162 ++ kernel/riscv64/zscal.c | 88 + kernel/riscv64/zscal_vector.c | 152 ++ kernel/riscv64/zswap.c | 72 + kernel/riscv64/zswap_vector.c | 117 ++ lapack/laswp/riscv64/Makefile | 13 + param.h | 78 + test/Makefile | 6 + 109 files changed, 19571 insertions(+), 1 deletion(-) create mode 100644 Makefile.riscv64 create mode 100644 common_riscv64.h create mode 100644 
cpuid_riscv64.c create mode 100644 kernel/generic/trmmkernel_16x4.c create mode 100644 kernel/generic/trmmkernel_8x4.c create mode 100644 kernel/generic/trmmkernel_8x8.c create mode 100644 kernel/riscv64/KERNEL create mode 100644 kernel/riscv64/KERNEL.C910V create mode 100644 kernel/riscv64/KERNEL.RISCV64_GENERIC create mode 100644 kernel/riscv64/amax.c create mode 100644 kernel/riscv64/amax_vector.c create mode 100644 kernel/riscv64/amin.c create mode 100644 kernel/riscv64/amin_vector.c create mode 100644 kernel/riscv64/asum.c create mode 100644 kernel/riscv64/asum_vector.c create mode 100644 kernel/riscv64/axpby.c create mode 100644 kernel/riscv64/axpby_vector.c create mode 100644 kernel/riscv64/axpy.c create mode 100644 kernel/riscv64/axpy_vector.c create mode 100644 kernel/riscv64/copy.c create mode 100644 kernel/riscv64/copy_vector.c create mode 100644 kernel/riscv64/dgemm_kernel_8x4_c910v.c create mode 100644 kernel/riscv64/dot.c create mode 100644 kernel/riscv64/dot_vector.c create mode 100644 kernel/riscv64/gemv_n.c create mode 100644 kernel/riscv64/gemv_n_vector.c create mode 100644 kernel/riscv64/gemv_t.c create mode 100644 kernel/riscv64/gemv_t_vector.c create mode 100644 kernel/riscv64/iamax.c create mode 100644 kernel/riscv64/iamax_vector.c create mode 100644 kernel/riscv64/iamin.c create mode 100644 kernel/riscv64/iamin_vector.c create mode 100644 kernel/riscv64/imax.c create mode 100644 kernel/riscv64/imax_vector.c create mode 100644 kernel/riscv64/imin.c create mode 100644 kernel/riscv64/imin_vector.c create mode 100644 kernel/riscv64/izamax.c create mode 100644 kernel/riscv64/izamax_vector.c create mode 100644 kernel/riscv64/izamin.c create mode 100644 kernel/riscv64/izamin_vector.c create mode 100644 kernel/riscv64/max.c create mode 100644 kernel/riscv64/max_vector.c create mode 100644 kernel/riscv64/min.c create mode 100644 kernel/riscv64/min_vector.c create mode 100644 kernel/riscv64/nrm2.c create mode 100644 kernel/riscv64/nrm2_vector.c create 
mode 100644 kernel/riscv64/nrm2_vector_dot.c create mode 100644 kernel/riscv64/omatcopy_cn.c create mode 100644 kernel/riscv64/omatcopy_ct.c create mode 100644 kernel/riscv64/omatcopy_rn.c create mode 100644 kernel/riscv64/omatcopy_rt.c create mode 100644 kernel/riscv64/rot.c create mode 100644 kernel/riscv64/rot_vector.c create mode 100644 kernel/riscv64/scal.c create mode 100644 kernel/riscv64/scal_vector.c create mode 100644 kernel/riscv64/sgemm_kernel_16x4_c910v.c create mode 100644 kernel/riscv64/swap.c create mode 100644 kernel/riscv64/swap_vector.c create mode 100644 kernel/riscv64/symv_L.c create mode 100644 kernel/riscv64/symv_L_vector.c create mode 100644 kernel/riscv64/symv_U.c create mode 100644 kernel/riscv64/symv_U_vector.c create mode 100644 kernel/riscv64/zamax.c create mode 100644 kernel/riscv64/zamax_vector.c create mode 100644 kernel/riscv64/zamin.c create mode 100644 kernel/riscv64/zamin_vector.c create mode 100644 kernel/riscv64/zasum.c create mode 100644 kernel/riscv64/zasum_vector.c create mode 100644 kernel/riscv64/zaxpby.c create mode 100644 kernel/riscv64/zaxpby_vector.c create mode 100644 kernel/riscv64/zaxpy.c create mode 100644 kernel/riscv64/zaxpy_vector.c create mode 100644 kernel/riscv64/zcopy.c create mode 100644 kernel/riscv64/zcopy_vector.c create mode 100644 kernel/riscv64/zdot.c create mode 100644 kernel/riscv64/zdot_vector.c create mode 100644 kernel/riscv64/zgemv_n.c create mode 100644 kernel/riscv64/zgemv_n_vector.c create mode 100644 kernel/riscv64/zgemv_t.c create mode 100644 kernel/riscv64/zgemv_t_vector.c create mode 100644 kernel/riscv64/zhemv_LM_vector.c create mode 100644 kernel/riscv64/zhemv_UV_vector.c create mode 100644 kernel/riscv64/znrm2.c create mode 100644 kernel/riscv64/znrm2_vector.c create mode 100644 kernel/riscv64/zomatcopy_cn.c create mode 100644 kernel/riscv64/zomatcopy_cnc.c create mode 100644 kernel/riscv64/zomatcopy_ct.c create mode 100644 kernel/riscv64/zomatcopy_ctc.c create mode 100644 
kernel/riscv64/zomatcopy_rn.c create mode 100644 kernel/riscv64/zomatcopy_rnc.c create mode 100644 kernel/riscv64/zomatcopy_rt.c create mode 100644 kernel/riscv64/zomatcopy_rtc.c create mode 100644 kernel/riscv64/zrot.c create mode 100644 kernel/riscv64/zrot_vector.c create mode 100644 kernel/riscv64/zscal.c create mode 100644 kernel/riscv64/zscal_vector.c create mode 100644 kernel/riscv64/zswap.c create mode 100644 kernel/riscv64/zswap_vector.c create mode 100644 lapack/laswp/riscv64/Makefile diff --git a/Makefile.prebuild b/Makefile.prebuild index 48fb5e991..d6395da7b 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -41,6 +41,10 @@ ifeq ($(TARGET), I6500) TARGET_FLAGS = -mips64r6 endif +ifeq ($(TARGET), C910V) +TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 new file mode 100644 index 000000000..15d7b059c --- /dev/null +++ b/Makefile.riscv64 @@ -0,0 +1,4 @@ +ifeq ($(CORE), C910V) +CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v +FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static +endif diff --git a/Makefile.system b/Makefile.system index 461f7370b..fe2aecd82 100644 --- a/Makefile.system +++ b/Makefile.system @@ -724,7 +724,10 @@ endif endif endif - +ifeq ($(ARCH), riscv64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif # diff --git a/TargetList.txt b/TargetList.txt index 66eca4506..86177ebca 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -104,3 +104,6 @@ VORTEX ZARCH_GENERIC Z13 Z14 + +10.RISC-V 64: +RISCV64_GENERIC diff --git a/c_check b/c_check index 5ea93b75c..405963ae6 100644 --- a/c_check +++ b/c_check @@ -92,6 +92,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $defined = 0; @@ -136,6 +137,11 @@ 
if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/common.h b/common.h index a3ef99b59..faa75c447 100644 --- a/common.h +++ b/common.h @@ -437,6 +437,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..49368c613 --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,98 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#if defined(C910V) +#include +#endif + +#endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..8a3209cb3 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,113 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 +#define CPU_C910V 1 + +static char *cpuname[] = { + "UNKOWN", + "C910V" +}; + +int detect(void){ + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("RISCV64"); +} + +void get_subarchitecture(void){ +} + +void get_subdirname(void){ + printf("riscv64"); +} + +void get_cpuconfig(void){ + printf("#define UNKNOWN\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); +} + +void get_libname(void){ + printf("riscv64\n"); +} diff --git a/ctest.c b/ctest.c index cd84ab1bb..83a3b7d6c 100644 --- a/ctest.c +++ b/ctest.c @@ -153,6 +153,10 @@ ARCH_ARM ARCH_ARM64 #endif +#if defined(__riscv) +ARCH_RISCV64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif diff --git a/getarch.c b/getarch.c index e2c22d3a0..58465fb56 100644 --- a/getarch.c +++ b/getarch.c @@ -981,6 +981,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1252,6 +1266,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1306,6 +1335,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2ba593c2e..893713769 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -25,6 +25,10 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif +ifeq ($(ARCH), riscv64) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif diff --git a/kernel/generic/trmmkernel_16x4.c b/kernel/generic/trmmkernel_16x4.c new file mode 100644 index 000000000..7ea4e108c --- /dev/null +++ b/kernel/generic/trmmkernel_16x4.c @@ -0,0 +1,2092 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; 
+ FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c new file mode 100644 index 000000000..b6aec131e --- /dev/null +++ b/kernel/riscv64/amax_vector.c @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + 
BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + BLASLONG inc_xv = inc_x * gvl; + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c new file mode 100644 index 000000000..53243ad56 --- /dev/null +++ b/kernel/riscv64/amin_vector.c @@ -0,0 +1,241 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c new file mode 100644 index 000000000..7ab7484e8 --- /dev/null +++ b/kernel/riscv64/asum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights 
reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*8 \n\t"\ + "addi t2, %[PB], 2*8 \n\t"\ + "addi t3, %[PB], 3*8 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "fld ft1, (t1) \n\t"\ + "fld ft2, (t2) \n\t"\ + "fld ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 2*8 \n\t"\ + "addi t5, %[PA], 4*8 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 6*8 \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 
4*8 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "fld ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + 
"vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "fld ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "fld ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "fld ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* 
ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x 
= inc_x * sizeof(FLOAT); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + } + return(dot); +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c new file mode 100644 index 000000000..3aa64afc9 --- /dev/null +++ b/kernel/riscv64/iamax_vector.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + unsigned int 
gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + 
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c new file mode 100644 index 000000000..608f19a00 --- /dev/null +++ b/kernel/riscv64/iamin_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; 
+ unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update 
v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c new file mode 100644 index 000000000..44af7101b --- /dev/null +++ b/kernel/riscv64/imax_vector.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = 
VLEV_FLOAT(&x[j], gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + v_max_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT 
cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c new file mode 100644 index 000000000..ffc65226e --- /dev/null +++ b/kernel/riscv64/imin.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c new file mode 100644 index 000000000..e6e0e9f9f --- /dev/null +++ b/kernel/riscv64/imin_vector.c @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = 
VLEV_FLOAT(&x[j], gvl); + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, 
e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c new file mode 100644 index 000000000..62c95d973 --- /dev/null +++ b/kernel/riscv64/izamax_vector.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project 
+All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + 
gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element greater than v_max + mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + j += 
gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + maxf = vx0[0]; + mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_max = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + FLOAT cur_maxf = vx0[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c new file mode 100644 index 000000000..38eccf1b5 --- /dev/null +++ b/kernel/riscv64/izamin_vector.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT 
vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + 
"vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element less than v_min + mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + minf = vx0[0]; + mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + 
:"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_min = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + FLOAT cur_minf = vx0[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c new file mode 100644 index 000000000..4ef75452d --- /dev/null +++ b/kernel/riscv64/max_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT maxf=-FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + BLASLONG idx = 0, inc_xv = inc_x * gvl; + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c new file mode 100644 index 000000000..83c965bfa --- /dev/null +++ b/kernel/riscv64/min_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 
+ ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c new file mode 100644 index 000000000..785c0d2f8 --- /dev/null +++ b/kernel/riscv64/nrm2_vector.c @@ -0,0 +1,220 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL16x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) 
\n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL16x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) 
\n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL16x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL16x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 
\n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t3, t3, 4*4 \n\t" + + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) 
\n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t" + + +#define KERNEL16x2_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + 
"vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t" + + +#define KERNEL16x2_E \ + "vfmacc.vv 
v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t" + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c new file mode 100644 index 000000000..9377bf4b9 --- /dev/null +++ b/kernel/riscv64/swap_vector.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + for(i=0,j=0; i 0){ 
+ gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jy += inc_y; + a_ptr += lda; + } + }else if(inc_y == 1){ + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + inc_xv = inc_x * gvl; 
+ for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + jx += inc_x; + a_ptr += lda; + } + }else{ + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U.c b/kernel/riscv64/symv_U.c 
new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/riscv64/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j 0){ + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = 
vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jy += inc_y; + } + }else if(inc_y == 1){ + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + } + }else{ + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = 
VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} + diff --git a/kernel/riscv64/zamax.c b/kernel/riscv64/zamax.c new file mode 100644 index 000000000..a39bd7821 --- /dev/null +++ b/kernel/riscv64/zamax.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c new file mode 100644 index 000000000..a6c742b14 --- /dev/null +++ b/kernel/riscv64/zamax_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max = VFMVVF_FLOAT(0, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i maxf) + maxf = v_max[0]; + } + return(maxf); +} diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c new file mode 100644 index 000000000..44a7cf1dc --- /dev/null +++ b/kernel/riscv64/zamin_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + MASK_T mask0, mask1; 
+ BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c new file mode 100644 index 000000000..d9fa88971 --- /dev/null +++ b/kernel/riscv64/zasum_vector.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + 
inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = 
VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c new file mode 100644 index 000000000..6fe12c76c --- /dev/null +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLSEV_FLOAT vlsev_float32xm4 +#define VSSEV_FLOAT vssev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLSEV_FLOAT vlsev_float64xm4 +#define VSSEV_FLOAT vssev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 +#define VFNMSACVV_FLOAT 
vfnmsacvv_float64xm4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = 
VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * 
a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c new file mode 100644 index 000000000..fc1c8b54a --- /dev/null +++ b/kernel/riscv64/znrm2.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c new file mode 100644 index 000000000..b0ebfa5f4 --- /dev/null +++ b/kernel/riscv64/znrm2_vector.c @@ -0,0 +1,278 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); +// if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, 
FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c new file mode 100644 index 000000000..b655a968c --- /dev/null +++ b/kernel/riscv64/zswap_vector.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG n2 = n * 2; + if(gvl <= n2/2){ + for(i=0,j=0; i Date: Thu, 22 Oct 2020 22:00:00 -0400 Subject: [PATCH 009/121] reuse variables defined in Makefile.system --- Makefile.x86_64 | 7 +------ kernel/Makefile | 15 ++------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index a849f0b01..49a9a0a23 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -47,8 +47,6 @@ ifndef DYNAMIC_ARCH ifndef 
NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake @@ -73,10 +71,7 @@ ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) CCOMMON_OPT += -mavx2 endif diff --git a/kernel/Makefile b/kernel/Makefile index e52781c6d..e811ed43d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,11 +12,6 @@ ifdef HAVE_SSSE3 CFLAGS += -mssse3 endif -ifeq ($(C_COMPILER), GCC) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -26,20 +21,14 @@ endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif ifeq 
($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) - GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) - GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) - GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) - GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) + GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif From dd6ebdfdab65e5235da4887c943f7639639d19af Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 23 Oct 2020 10:32:03 +0800 Subject: [PATCH 010/121] Refactor the performance measurement system --- benchmark/amax.c | 170 ++++++++++++++-------------------------- benchmark/amin.c | 166 ++++++++++++++------------------------- benchmark/asum.c | 180 +++++++++++++------------------------------ benchmark/axpby.c | 86 +-------------------- benchmark/axpy.c | 81 +------------------ benchmark/bench.h | 103 +++++++++++++++++++++++++ benchmark/cholesky.c | 50 +----------- benchmark/copy.c | 86 +-------------------- benchmark/dot.c | 84 +------------------- benchmark/geev.c | 80 +------------------ benchmark/gemm.c | 80 +------------------ benchmark/gemm3m.c | 83 +------------------- 12 files changed, 302 insertions(+), 947 deletions(-) create mode 100644 benchmark/bench.h diff --git a/benchmark/amax.c b/benchmark/amax.c index 19ae95c8b..29310dd71 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMAX #ifdef COMPLEX #ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) +#define AMAX BLASFUNC(dzamax) #else -#define AMAX BLASFUNC(scamax) +#define AMAX BLASFUNC(scamax) #endif #else #ifdef DOUBLE -#define AMAX BLASFUNC(damax) +#define AMAX BLASFUNC(damax) #else -#define AMAX BLASFUNC(samax) +#define AMAX BLASFUNC(samax) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; + int from = 1; + int to 
= 200; + int step = 1; - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -152,37 +100,31 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMIN #ifdef COMPLEX #ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) +#define AMIN BLASFUNC(dzamin) #else -#define AMIN BLASFUNC(scamin) +#define AMIN BLASFUNC(scamin) #endif #else #ifdef DOUBLE -#define AMIN BLASFUNC(damin) +#define AMIN BLASFUNC(damin) #else -#define AMIN BLASFUNC(samin) -#endif -#endif 
- -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - +#define AMIN BLASFUNC(samin) #endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 #endif - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; + int from = 1; + int to = 200; + int step = 1; struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; 
+ } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -151,39 +101,35 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef ASUM #ifdef COMPLEX #ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) +#define ASUM BLASFUNC(dzasum) #else -#define ASUM BLASFUNC(scasum) +#define ASUM BLASFUNC(scasum) #endif #else #ifdef DOUBLE -#define ASUM BLASFUNC(dasum) +#define ASUM BLASFUNC(dasum) #else -#define ASUM BLASFUNC(sasum) +#define ASUM BLASFUNC(sasum) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert 
into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; FLOAT result; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; - -#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) - struct timeval start, stop; - double time1,timeg; -#else - struct timespec start = { 0, 0 }, stop = { 0, 0 }; + int from = 1; + int to = 200; + int step = 1; double time1, timeg; -#endif - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = 
%3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } - #ifdef __linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); + timeg = 0; - for (l=0; l1) - timeg /= loops; + if (loops > 1) + timeg /= loops; #ifdef COMPLEX fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif - } return 0; diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 793ee7e40..d02d9a889 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPBY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -176,16 +104,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -127,8 +56,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - clock_gettime( CLOCK_REALTIME, &start); + begin(); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - clock_gettime( CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; + time1 = getsec(); timeg += time1; diff --git a/benchmark/bench.h b/benchmark/bench.h 
new file mode 100644 index 000000000..9055beaa7 --- /dev/null +++ b/benchmark/bench.h @@ -0,0 +1,103 @@ +#include +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + struct timeval start, stop; +#else + struct timespec start = { 0, 0 }, stop = { 0, 0 }; +#endif + +double getsec() +{ +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#else + return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; +#endif +} + +void begin() { +#if 
defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + gettimeofday( &start, (struct timezone *)0); +#else + clock_gettime(CLOCK_REALTIME, &start); +#endif +} + +void end() { +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + gettimeofday( &stop, (struct timezone *)0); +#else + clock_gettime(CLOCK_REALTIME, &stop); +#endif +} \ No newline at end of file diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 5908b6085..65b20d039 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -71,41 +66,6 @@ double fabs(double); #endif #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - - static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; @@ -145,7 +105,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1; argc--;argv++; @@ -220,20 +179,19 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + 
end(); if (info != 0) { fprintf(stderr, "Info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - maxerr = 0.; if (!(uplos & 1)) { for (j = 0; j < m; j++) { diff --git a/benchmark/copy.c b/benchmark/copy.c index eb5148fff..c5e447521 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef COPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory 
allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,11 +57,9 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1 = 0.0, timeg = 0.0; long nanos = 0; time_t seconds = 0; - struct timespec time_start = { 0, 0 }, time_end = { 0, 0 }; argc--;argv++; @@ -176,15 +103,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(ddot) #else #define DOT BLASFUNC(sdot) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int 
main(int argc, char *argv[]){ FLOAT *x, *y; @@ -169,15 +96,12 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); result = DOT (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + end(); + timeg += getsec(); } diff --git a/benchmark/geev.c b/benchmark/geev.c index 4fd2c8d6f..6e22cdfb6 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -36,13 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEEV @@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & 
~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; @@ -154,7 +83,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -223,7 +151,7 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; #ifndef COMPLEX @@ -239,14 +167,14 @@ int main(int argc, char *argv[]){ GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 8cd14bbed..35f5096f3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ IFLOAT *a, *b; @@ -139,7 +68,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--;argv++; @@ -228,14 +156,14 @@ int main(int argc, char *argv[]){ ldc = m; fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k); - gettimeofday( &start, (struct timezone *)0); + begin(); for (j=0; j -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -187,16 +116,12 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; - + end(); + timeg += getsec(); } timeg /= loops; From 81fcfd5ed3ecc3a5f1aefec9ab202d487af85da0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 Oct 2020 23:28:29 +0200 Subject: [PATCH 011/121] Update version to 0.3.12.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53c1709a8..aeb4399e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12) +set(OpenBLAS_PATCH_VERSION 12.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 2f9fc9be30e33efb21b7873c8ee060af190aabd8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 Oct 2020 23:29:05 +0200 Subject: [PATCH 012/121] Update version to 0.3.12.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a4d11dc7c..1a0965d08 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12 +VERSION = 0.3.12.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From fd7da56965a5af99f7ec2af161f0057f8b9d6bdb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Oct 2020 12:01:50 +0100 Subject: [PATCH 013/121] Move definitions that are neither needed nor supported on SUNOS --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ba2bb55b9..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1767,11 +1767,11 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) From eec517af0eb1bea187236ccd1072741fbabce01c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Oct 2020 00:21:56 +0100 Subject: [PATCH 014/121] Expressly enable neon for use with intrinsics if available --- Makefile.arm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.arm b/Makefile.arm index fac6b56824..a27b58e84 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp FCOMMON_OPT += -mfpu=vfp endif + +ifdef HAVE_NEON +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon +endif From f917c26e83e040270cb98488b296a5c85cbb5ffb Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 26 Oct 2020 10:25:05 +0800 Subject: [PATCH 015/121] Refractoring remaining benchmark cases. 
--- benchmark/amin.c | 1 - benchmark/bench.h | 1 + benchmark/dot.c | 1 - benchmark/gemm3m.c | 1 - benchmark/gemv.c | 86 ++++------------------------------------- benchmark/ger.c | 86 +++-------------------------------------- benchmark/gesv.c | 83 ++-------------------------------------- benchmark/getri.c | 79 ++------------------------------------ benchmark/hbmv.c | 84 ++-------------------------------------- benchmark/hemm.c | 81 ++------------------------------------- benchmark/hemv.c | 82 ++------------------------------------- benchmark/her.c | 85 ++--------------------------------------- benchmark/her2.c | 85 ++--------------------------------------- benchmark/her2k.c | 81 ++------------------------------------- benchmark/herk.c | 83 ++-------------------------------------- benchmark/hpmv.c | 82 ++------------------------------------- benchmark/iamax.c | 80 ++------------------------------------ benchmark/iamin.c | 80 ++------------------------------------ benchmark/imax.c | 80 ++------------------------------------ benchmark/imin.c | 80 ++------------------------------------ benchmark/linpack.c | 85 ++++------------------------------------- benchmark/max.c | 80 ++------------------------------------ benchmark/min.c | 80 ++------------------------------------ benchmark/nrm2.c | 80 ++------------------------------------ benchmark/potrf.c | 56 +++++---------------------- benchmark/rot.c | 79 ++------------------------------------ benchmark/rotm.c | 82 +++------------------------------------ benchmark/scal.c | 80 ++------------------------------------ benchmark/spmv.c | 81 ++------------------------------------- benchmark/spr.c | 82 ++------------------------------------- benchmark/spr2.c | 80 ++------------------------------------ benchmark/swap.c | 79 ++------------------------------------ benchmark/symm.c | 80 ++------------------------------------ benchmark/symv.c | 80 ++------------------------------------ benchmark/syr.c | 80 
++------------------------------------ benchmark/syr2.c | 81 ++------------------------------------- benchmark/syr2k.c | 79 ++------------------------------------ benchmark/syrk.c | 80 ++------------------------------------ benchmark/tpmv.c | 48 ++--------------------- benchmark/tpsv.c | 48 ++--------------------- benchmark/trmm.c | 79 ++------------------------------------ benchmark/trmv.c | 48 ++--------------------- benchmark/trsm.c | 79 ++------------------------------------ benchmark/trsv.c | 87 ++---------------------------------------- benchmark/zdot-intel.c | 83 +++------------------------------------- benchmark/zdot.c | 81 ++------------------------------------- 46 files changed, 184 insertions(+), 3114 deletions(-) diff --git a/benchmark/amin.c b/benchmark/amin.c index 4bcff9bba..54a1d266a 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -57,7 +57,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--; diff --git a/benchmark/bench.h b/benchmark/bench.h index 9055beaa7..1f9b8986c 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -67,6 +67,7 @@ static void *huge_malloc(BLASLONG size){ return address; } + #define malloc huge_malloc #endif diff --git a/benchmark/dot.c b/benchmark/dot.c index 86f4e3828..72a756249 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -49,7 +49,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index 76b8176b2..f505ca049 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -62,7 +62,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; diff --git a/benchmark/gemv.c b/benchmark/gemv.c index fb1f541d3..a0001277a 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GEMV @@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -211,10 +139,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) 
rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } @@ -248,10 +176,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } diff --git a/benchmark/ger.c b/benchmark/ger.c index d53d328f0..7ce08c3ad 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GER @@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -131,7 +59,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -198,16 +125,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -66,71 +61,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; 
- - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -142,7 +72,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -194,22 +123,18 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GESV (&m, &m, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); - - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - + end(); + time1 = getsec(); fprintf(stderr, "%10.2f MFlops %10.6f s\n", COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); - } return 0; diff --git a/benchmark/getri.c b/benchmark/getri.c index a07014768..98a860906 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GETRF #undef GETRI @@ -72,71 +67,6 @@ extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*work; @@ -148,7 +78,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -205,21 +134,21 @@ int main(int argc, char *argv[]){ exit(1); } - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); 
lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 60ba9fb89..35249bdf9 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HBMV - #ifdef DOUBLE #define HBMV BLASFUNC(zhbmv) #else #define HBMV BLASFUNC(chbmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation 
failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -125,7 +52,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -186,15 +112,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); - timeg += time1; + timeg += getsec(); } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2bc165458..a0a9985ad 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMM @@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HEMM BLASFUNC(chemm) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -126,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 98618a04e..ad130ddd0 100644 --- 
a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMV - #ifdef DOUBLE #define HEMV BLASFUNC(zhemv) #else #define HEMV BLASFUNC(chemv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +108,13 @@ 
int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/her.c b/benchmark/her.c index 010f8120d..cd1fb7f48 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER - #ifdef DOUBLE #define HER BLASFUNC(zher) #else #define HER BLASFUNC(cher) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - 
printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -126,8 +53,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timeval start, stop; double time1; argc--;argv++; @@ -166,15 +91,13 @@ int main(int argc, char *argv[]){ x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HER (&uplo, &m, alpha, x, &incx, a, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2.c b/benchmark/her2.c index 0f80f3ed9..d87bfd466 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2 - #ifdef DOUBLE #define HER2 BLASFUNC(zher2) #else #define HER2 BLASFUNC(cher2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,16 +95,13 @@ int main(int argc, char *argv[]){ y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); - + begin(); HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); - 
gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 021873beb..d3cdce696 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2K #ifdef DOUBLE @@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HER2K BLASFUNC(cher2k) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if 
((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,13 +96,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/herk.c b/benchmark/herk.c index c09d35c1f..628dc2c11 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HERK - #ifdef DOUBLE #define HERK BLASFUNC(zherk) #else #define HERK BLASFUNC(cherk) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -167,18 +93,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = 
(double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); - } return 0; diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index b0157094e..907e2adc4 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HPMV - #ifdef DOUBLE #define HPMV BLASFUNC(zhpmv) #else #define HPMV BLASFUNC(chpmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - 
exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -183,13 +109,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamax.c b/benchmark/iamax.c index c87044ab4..15618cbcc 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMAX @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamin.c b/benchmark/iamin.c index e7c8e59e4..a57638ecc 100644 --- a/benchmark/iamin.c +++ 
b/benchmark/iamin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMIN @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int 
argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imax.c b/benchmark/imax.c index b56ef64ba..b96b17167 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address 
= shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imin.c b/benchmark/imin.c index 4a92c8bd0..095eacca9 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 661a44175..202035245 100644 --- a/benchmark/linpack.c +++ 
b/benchmark/linpack.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -72,71 +67,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1, time2; argc--;argv++; @@ -198,31 +127,31 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GETRF (&m, &m, a, &m, ipiv, &info); - gettimeofday( &stop, (struct 
timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - gettimeofday( &start, (struct timezone *)0); + begin(); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time2 = getsec(); maxerr = 0.; diff --git a/benchmark/max.c b/benchmark/max.c index a19a386a2..301b943a5 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/min.c b/benchmark/min.c index 4df8fb0fd..39df37a29 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ 
-25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ 
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index 0f416621a..cd64d564a 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NRM2 @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, 
SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NRM2 (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/potrf.c b/benchmark/potrf.c index cb4c23bab..116d0cca5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -86,37 +81,7 @@ double fabs(double); // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - 
tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif int main(int argc, char *argv[]){ @@ -141,7 +106,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -217,18 +181,18 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrf info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; if ( btest == 'S' ) @@ -240,17 +204,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; } @@ -258,18 +222,18 @@ int main(int argc, char *argv[]){ if ( btest == 'I' ) { - gettimeofday( &start, (struct timezone *)0); + begin(); POTRI(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potri info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; } diff --git a/benchmark/rot.c b/benchmark/rot.c 
index 69698988d..15b630e36 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROT @@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -133,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; 
argc--;argv++; @@ -179,13 +108,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROTM @@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ROTM BLASFUNC(srotm) #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) -{ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid = - shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT | 0600)) < 0) { - printf("Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf("Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -122,7 +51,7 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timeval start, stop; + double time1, timeg; argc--; @@ -188,14 +117,13 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - gettimeofday(&start, (struct timezone *)0); + begin(); ROTM(&m, x, &inc_x, y, &inc_y, param); - gettimeofday(&stop, (struct 
timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + - (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/scal.c b/benchmark/scal.c index 8bd62c77c..8de6cfd04 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SCAL @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; 
-} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +57,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -174,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SCAL (&m, alpha, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spmv.c b/benchmark/spmv.c index cff504d3b..e4dcbf4ae 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPMV - #ifndef COMPLEX #ifdef DOUBLE @@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -135,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -193,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spr.c 
b/benchmark/spr.c index 5dcaa4f8b..2fc9994f8 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPR @@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SPR BLASFUNC(sspr) #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*c; @@ -129,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; 
- struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -173,13 +99,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR (&uplo, &m, alpha, c, &inc_x, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index a5f2791f7..8f194e83a 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SPR2 @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*b,*c; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +110,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/swap.c b/benchmark/swap.c index 76d545995..64ebe5e9b 100644 --- 
a/benchmark/swap.c +++ b/benchmark/swap.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SWAP @@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -175,13 
+104,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SWAP (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/symm.c b/benchmark/symm.c index bb9849eb5..1c6d91d00 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/symv.c b/benchmark/symv.c index e4c892b5a..0a35aaef0 100644 --- a/benchmark/symv.c +++ 
b/benchmark/symv.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMV @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -134,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -192,13 +120,13 @@ int main(int 
argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/syr.c b/benchmark/syr.c index a9dd293e6..ebbf2bd3c 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x,*a; @@ -124,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -165,13 +93,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 9efbca315..acbc86987 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -25,13 +25,7 
@@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYR2 @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYR2 BLASFUNC(ssyr2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y, *a; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -174,13 +101,13 @@ int main(int argc, char 
*argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index a906559eb..3895c2861 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR2K @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = 
shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +67,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +110,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 0fbb943f6..82606a21a 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYRK @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -177,13 +105,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index fe9d07534..41f2e0fb8 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c 
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 8472ac261..ebfa29692 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPSV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 23af122b4..3ab9fc255 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -141,7 +71,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -180,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + 
end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", diff --git a/benchmark/trmv.c b/benchmark/trmv.c index 46641b3e4..0e8088b54 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git 
a/benchmark/trsm.c b/benchmark/trsm.c index 17676946a..d2ebd7f54 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRSM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct 
timeval start, stop; double time1; argc--;argv++; @@ -196,13 +125,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 1734e2adb..66ac3a3c7 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include -#include "common.h" - +#include "bench.h" #undef GEMV #undef TRSV @@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -133,7 +61,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timespec time_start, time_end; time_t seconds = 0; double time1,timeg; @@ -189,19 +116,13 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); - + begin(); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); - nanos = time_end.tv_nsec - time_start.tv_nsec; - seconds = time_end.tv_sec - time_start.tv_sec; - - time1 = seconds + nanos /1.e9; + end(); + time1 = getsec(); timeg += time1; } - timeg /= loops; long long muls = n*(n+1)/2.0; long long adds = (n 
- 1.0)*n/2.0; diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index ba1515365..06cdde13a 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#define RETURN_BY_STACK 1 -#include "common.h" +#include "bench.h" +#define RETURN_BY_STACK 1 #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, 
*y; @@ -123,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); DOT (&result, &m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index fa624e859..23b3efcad 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -122,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -169,15 +96,15 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); #ifdef RETURN_BY_STACK DOT (&result , &m, x, &inc_x, y, &inc_y ); #else result = DOT (&m, x, &inc_x, y, &inc_y ); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 
1.e-6; + time1 = getsec(); timeg += time1; From c24ba8b1dd155b30eb5b7c4e7dc7b38c9e6597e3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 26 Oct 2020 13:24:59 -0500 Subject: [PATCH 016/121] Optimize saxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/KERNEL.POWER10 | 2 +- kernel/power/saxpy_microk_power10.c | 181 ++++++++++++++++++++++++++++ kernel/power/saxpy_power10.c | 119 ++++++++++++++++++ 3 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 kernel/power/saxpy_microk_power10.c create mode 100644 kernel/power/saxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 86df7e3a2..1e514fcc9 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -141,7 +141,7 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -SAXPYKERNEL = saxpy.c +SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power9.S diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c new file mode 100644 index 000000000..6ede1dcdd --- /dev/null +++ b/kernel/power/saxpy_microk_power10.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void saxpy_kernel_64(long n, float *x, float *y, float alpha) +{ + __vector float t0 = {alpha, alpha,alpha, alpha}; + + __asm__ + ( + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + "lxvp 48, 128(%2) \n\t" + "lxvp 50, 160(%2) \n\t" + "lxvp 52, 192(%2) \n\t" + "lxvp 54, 224(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + "lxvp 36, 256(%3) \n\t" + "lxvp 38, 288(%3) \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "lxvp 44, 320(%3) \n\t" + "lxvp 46, 352(%3) \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + + "lxvp 48, 128(%2) \n\t" + "stxvp 56, 128(%3) \n\t" + + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "lxvp 50, 160(%2) \n\t" + "stxvp 58, 160(%3) \n\t" + + "lxvp 56, 384(%3) \n\t" + "lxvp 58, 416(%3) \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + + "lxvp 52, 192(%2) \n\t" + "stxvp 60, 192(%3) \n\t" + + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + + "lxvp 54, 224(%2) \n\t" + "stxvp 62, 224(%3) \n\t" + + "lxvp 60, 448(%3) \n\t" + "lxvp 62, 480(%3) \n\t" + + "addi %2, %2, 256 \n\t" + "addi %3, %3, 256 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + "stxvp 56, 128(%3) \n\t" + "stxvp 58, 160(%3) \n\t" + "stxvp 60, 192(%3) \n\t" + "stxvp 62, 224(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "wa" (t0), // 4 + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + +} + + diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c new file mode 100644 index 000000000..8c7c22390 --- /dev/null +++ b/kernel/power/saxpy_power10.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "saxpy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_8 +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y, da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n 
& -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + From 878b6d1f410c740372a9b5addf6c5033d893cc12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Oct 2020 21:35:40 +0100 Subject: [PATCH 017/121] Remove spurious expr in flang version check --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 30d8f4ccf..6d985786d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -855,7 +855,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) ifeq ($(FLANG_VENDOR),AOCC) FCOMMON_OPT += -fno-unroll-loops endif From 6a1f3e40af7bd018f47afbf8fc543327b6552e48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Oct 2020 21:37:04 +0100 Subject: [PATCH 018/121] Remove debug printout of object list --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index 6b247b49f..7b60111f9 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -507,7 +507,7 @@ ifneq ($(BUILD_COMPLEX16),1) endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -$(info FUNCOBJS = {[$(FUNCOBJS)]} ) + ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif From b937d78a6d87dbda61a14788c33d48b9c885c6ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Oct 2020 17:51:32 +0100 Subject: [PATCH 019/121] Try to read cpu 
information from /sys/devices/system/cpu/cpu0 if HWCAP_CPUID fails --- driver/others/dynamic_arm64.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index be22b247c..007a221db 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) { static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; + char coremsg[128]; + +#if (!defined OS_LINUX && !defined OS_ANDROID) + return NULL; +#endif -#if (defined OS_LINUX || defined OS_ANDROID) if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { - char coremsg[128]; +#ifdef __linux + FILE *infile; + char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; + p = (char *) NULL ; + infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r"); + if (!infile) return NULL; + fgets(buffer, sizeof(buffer), infile); + midr_el1=strtoul(buffer,NULL,16); + fclose(infile); +#else snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); openblas_warning(1, coremsg); return NULL; - } -#else - return NULL; #endif - - get_cpu_ftr(MIDR_EL1, midr_el1); + } else { + get_cpu_ftr(MIDR_EL1, midr_el1); + } /* * MIDR_EL1 * @@ -219,6 +230,9 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + default: + snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); + openblas_warning(1, coremsg); } return NULL; } From e8cbf0fc50547e5b50bc2f15549515f64767d104 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Oct 2020 23:01:19 +0100 Subject: [PATCH 020/121] Output predefined HAVE_ entries to Makefile.conf for ARM with specified TARGET --- getarch.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 3f1448305..ab90f36d9 100644 --- a/getarch.c +++ b/getarch.c @@ -1405,8 +1405,41 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); -#if defined(__arm__) && !defined(FORCE) +#if defined(__arm__) +#if !defined(FORCE) + fprintf(stderr,"get features!\n"); get_features(); +#else + fprintf(stderr,"split archconfig!\n"); + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + if (*p != 'H') { + while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; } + if (*p == '-') continue; + } + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1\n"); + } + } + } else p ++; + } +#endif #endif From a7b1f9b1bbbfefb3f8b9dae126afdf054be97eda Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Wed, 28 Oct 2020 08:49:12 +0800 Subject: [PATCH 021/121] Implementation of BF16 based gemv 1. Add a new API -- sbgemv to support bfloat16 based gemv 2. 
Implement a generic kernel for sbgemv 3. Implement an avx512-bf16 based kernel for sbgemv Signed-off-by: Chen, Guobing --- cblas.h | 1 + cmake/kernel.cmake | 4 +- common_interface.h | 2 + common_level2.h | 4 + common_macro.h | 10 +- common_param.h | 4 +- common_sb.h | 4 + driver/level2/Makefile | 16 +- driver/level2/sbgemv_thread.c | 149 + driver/others/blas_server_omp.c | 1 - exports/gensymbol | 4 +- interface/Makefile | 17 +- interface/gemv.c | 1 - interface/sbgemv.c | 210 ++ kernel/Makefile.L2 | 22 + kernel/setparam-ref.c | 2 +- kernel/x86_64/KERNEL | 8 + kernel/x86_64/bf16_common_macros.h | 795 +++++ kernel/x86_64/sbgemv_n.c | 137 + kernel/x86_64/sbgemv_n_microk_cooperlake.c | 76 + .../sbgemv_n_microk_cooperlake_template.c | 234 ++ kernel/x86_64/sbgemv_t.c | 142 + kernel/x86_64/sbgemv_t_microk_cooperlake.c | 202 ++ .../sbgemv_t_microk_cooperlake_template.c | 3082 +++++++++++++++++ 24 files changed, 5111 insertions(+), 16 deletions(-) create mode 100644 driver/level2/sbgemv_thread.c create mode 100644 interface/sbgemv.c create mode 100644 kernel/x86_64/bf16_common_macros.h create mode 100644 kernel/x86_64/sbgemv_n.c create mode 100644 kernel/x86_64/sbgemv_n_microk_cooperlake.c create mode 100644 kernel/x86_64/sbgemv_n_microk_cooperlake_template.c create mode 100644 kernel/x86_64/sbgemv_t.c create mode 100644 kernel/x86_64/sbgemv_t_microk_cooperlake.c create mode 100644 kernel/x86_64/sbgemv_t_microk_cooperlake_template.c diff --git a/cblas.h b/cblas.h index bf310bed2..da00d46d6 100644 --- a/cblas.h +++ b/cblas.h @@ -393,6 +393,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST 
blasint incy); +void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 7d7f5ffda..0c102bae5 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -184,8 +184,8 @@ macro(SetDefaultL2) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../arm/gemv_n.c) - set(SBGEMVTKERNEL ../arm/gemv_t.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () diff --git a/common_interface.h b/common_interface.h index 032877fe1..b9ebb2772 100644 --- a/common_interface.h +++ b/common_interface.h @@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *, + bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, diff --git a/common_level2.h b/common_level2.h index 640d4a073..9a5ebb4d9 100644 --- a/common_level2.h +++ b/common_level2.h @@ -44,6 +44,10 @@ extern "C" { #endif +int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int 
sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); diff --git a/common_macro.h b/common_macro.h index 54deed57c..c6ea1bfd9 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,10 +646,12 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SBDTOBF16_K -#define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SBSTOBF16_K -#define S_BF16_TO_K SBF16TOS_K +#define D_TO_BF16_K SBDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define S_TO_BF16_K SBSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K +#define SBGEMV_N SBGEMV_N_K +#define SBGEMV_T SBGEMV_T_K #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K diff --git a/common_param.h b/common_param.h index b50e4ff80..3e3ae06f8 100644 --- a/common_param.h +++ b/common_param.h @@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); + int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, 
BLASLONG, float, float *, BLASLONG); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); diff --git a/common_sb.h b/common_sb.h index 66968ab00..9976e812e 100644 --- a/common_sb.h +++ b/common_sb.h @@ -8,6 +8,8 @@ #define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k +#define SBGEMV_N_K sbgemv_n +#define SBGEMV_T_K sbgemv_t #define SBGEMM_ONCOPY sbgemm_oncopy #define SBGEMM_OTCOPY sbgemm_otcopy @@ -29,6 +31,8 @@ #define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k +#define SBGEMV_N_K gotoblas -> sbgemv_n +#define SBGEMV_T_K gotoblas -> sbgemv_t #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy diff --git a/driver/level2/Makefile b/driver/level2/Makefile index 7212d6662..caecf4f97 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -413,7 +413,13 @@ XBLASOBJS += \ xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ - xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ + xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_thread_t$(TSUFFIX).$(SUFFIX) +endif endif @@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) +sbgemv_thread_t.$(SUFFIX) 
sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) +endif + + include ../../Makefile.tail diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c new file mode 100644 index 000000000..534c60f95 --- /dev/null +++ b/driver/level2/sbgemv_thread.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef TRANSA +#define SBGEMV SBGEMV_N +#else +#define SBGEMV SBGEMV_T +#endif + +static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){ + + bfloat16 *a, *x; + float *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to, n_from, n_to; + + a = (bfloat16 *)args->a; + x = (bfloat16 *)args->b; + y = (float *)args->c; + + lda = args->lda; + incx = args->ldb; + incy = args->ldc; + +#ifndef TRANSA // N + m_from = *(range_m + 0); + m_to = *(range_m + 1); + n_from = 0; + n_to = args -> n; + a += m_from; + y += m_from * incy; +#else // T + m_from = 0; + m_to = args->m; + n_from = *(range_n + 0); + n_to = *(range_n + 1); + a += n_from * lda; + y += n_from * incy; +#endif + + SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy); + + return 0; +} + +int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads) +{ + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + +#ifndef TRANSA + BLASLONG width_for_split = m; +#else + BLASLONG width_for_split = n; +#endif + + BLASLONG BLOCK_WIDTH = width_for_split/threads; + + int mode = BLAS_BFLOAT16 | BLAS_REAL; + + args.m = m; + args.n = n; + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)y; + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + args.alpha = (void *)α + args.beta = (void *)β + + range[0] = 0; + + int thread_idx; + + for (thread_idx=0; thread_idxsb=sb; } } diff 
--git a/exports/gensymbol b/exports/gensymbol index 22e470da5..857a17a9e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/interface/Makefile b/interface/Makefile index 7b60111f9..7b0bf1792 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -48,6 +48,7 @@ SBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS2OBJS = sbgemv.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -284,6 +285,7 @@ CSBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -382,6 +384,7 @@ SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) 
+SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -538,7 +541,7 @@ clean :: level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) +level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) @@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) +endif + ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< @@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< diff --git a/interface/gemv.c b/interface/gemv.c index c9d52cd69..d5d739fb1 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order, } #endif - //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; diff --git a/interface/sbgemv.c b/interface/sbgemv.c new file mode 100644 index 000000000..89debe82d --- 
/dev/null +++ b/interface/sbgemv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "l1param.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define ERROR_NAME "SBGEMV " + +#ifdef SMP +static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY) +{ + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + float alpha = *ALPHA; + float beta = *BETA; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') {i = 0;} + if (trans == 'T') {i = 1;} + if (trans == 'R') {i = 0;} + if (trans == 'C') {i = 1;} + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (i < 0) {info = 1;} + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy) +{ + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == 
CblasColMajor) { // Column Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 0; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 1; + } + } else { // Row Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 1; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 0; + } + + t = n; + n = m; + m = t; + } + + info = -1; + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (trans < 0) {info = 1;} + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + if (trans) { + lenx = m; + leny = n; + } else { + lenx = n; + leny = m; + } + + if (alpha == ZERO) { + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); + return; + } + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) {x -= (lenx - 1) * incx;} + if (incy < 0) {y -= (leny - 1) * incy;} + +#ifdef SMP + int thread_thres_row = 20480; + if (trans) { + if (n <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } else { + if (m <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } + + + if (nthreads == 1) { +#endif + (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); +#ifdef SMP + } else { + (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); + } +#endif + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + IDEBUG_END; + + return; +} diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 79399c342..888a9b959 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = ../x86_64/sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = ../x86_64/sbgemv_t.c 
+endif +endif + ### GER ### ifndef SGERKERNEL @@ -234,6 +244,12 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_t$(TSUFFIX).$(SUFFIX) +endif + ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 849a4194a..d0317a745 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -69,7 +69,7 @@ gotoblas_t TABLE_NAME = { snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, - sgemv_nTS, sgemv_tTS, sger_kTS, + sbgemv_nTS, sbgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, sbgemm_kernelTS, sbgemm_betaTS, diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 855e1ff8c..b92f480e9 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -384,6 +384,14 @@ endif GEMVDEP = ../l2param.h +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = sbgemv_n.c +endif + +ifndef 
SBGEMVTKERNEL +SBGEMVTKERNEL = sbgemv_t.c +endif + ifndef SGEMVNKERNEL SGEMVNKERNEL = sgemv_n.c endif diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h new file mode 100644 index 000000000..1014ecc4d --- /dev/null +++ b/kernel/x86_64/bf16_common_macros.h @@ -0,0 +1,795 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#ifndef __BF16_COMMON_MACROS +#define __BF16_COMMON_MACROS + +#include + +#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ + reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ + reg256##_1 = _mm512_castps512_ps256(reg512##_1); + + +#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda 
+ idx_n]); \ + regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ + regArray = _mm512_loadu_si512(&a[idx_m*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 
= _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda 
+ idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); + +#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray = _mm512_maskz_loadu_epi16(mask, &a[idx_m*lda + idx_n]); + +#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \ + reg = _mm512_loadu_si512(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ + reg = _mm256_loadu_si256(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ + reg = _mm_loadu_si128(x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ + reg = _mm512_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \ + reg = _mm256_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \ + reg = _mm_maskz_loadu_epi16(mask, x + idx_n); + + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31 + 
|g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31 +*/ +#define BF16_INTERLEAVE_8x32(regArray) \ + regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = 
_mm512_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15 +*/ +#define BF16_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, 
regArray##_3); \ + regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15); + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 +*/ +#define 
BF16_INTERLEAVE_4x32(regArray) \ + regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 +*/ +#define BF16_INTERLEAVE_4x16(regArray) \ + regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for x with 32 BF16 elements + Input - 
original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31 +*/ +#define BF16_INTERLEAVE_1x32(regArray) \ + regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3); + + +/* 2-step interleave for x with 16 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15 +*/ +#define BF16_INTERLEAVE_1x16(regArray) \ + regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \ + 
regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3); + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 4 pair of registers + |a0|a1|...|a14|a15|i0|i1|...|i14|i15| + |b0|b1|...|b14|b15|j0|j1|...|j14|j15| + |c0|c1|...|c14|c15|k0|k1|...|k14|k15| + |d0|d1|...|d14|d15|l0|l1|...|l14|l15| + |e0|e1|...|e14|e15|m0|m1|...|m14|m15| + |f0|f1|...|f14|f15|n0|n1|...|n14|n15| + |g0|g1|...|g14|g15|o0|o1|...|o14|o15| + |h0|h1|...|h14|h15|p0|p1|...|p14|p15| +*/ +#define BF16_INTERLEAVE256_8x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \ + regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \ + regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \ + regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \ + regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee); + + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 2 pair of registers + |a0|a1|...|a14|a15|e0|e1|...|e14|e15| + |b0|b1|...|b14|b15|f0|f1|...|f14|f15| + |c0|c1|...|c14|c15|g0|g1|...|g14|g15| + |d0|d1|...|d14|d15|h0|h1|...|h14|h15| +*/ +#define BF16_INTERLEAVE256_4x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \ + regArray##_3 = 
_mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee); + + +#define BF16_PERMUTE_8x32(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7); + + +#define BF16_PERMUTE_8x32_2(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7); + + +#define BF16_PERMUTE_4x32(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3); + + +#define BF16_PERMUTE_4x32_2(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, 
matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3); + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for 
_mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3); + + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \ + accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \ + accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \ + accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \ + accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \ + accumArray##_6 = 
_mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \ + accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_1x32(accumArray, matArray, xArray) \ + accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 16 elements per row + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \ + accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \ + accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \ + accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \ + accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \ + accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \ + accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray); + + +/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13| + |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13| + |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13| + |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13| + |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15| + 
|c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15| + |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15| + |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12| + |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13| + |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12| + |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13| + |a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14| + |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15| + |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14| + |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15| +*/ +#define FP32_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \ + regArray##_7 = 
(__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15); + +#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \ + regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \ + regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \ + regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \ + regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \ + regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \ + regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \ + regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \ + regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \ + \ + regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \ + regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]); + +/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5| + |c0|d0|c1|d1|c4|d4|c5|d5| + |e0|f0|e1|f1|e4|f4|e5|f5| + |g0|h0|g1|h1|g4|h4|g5|h5| + |a2|b2|a3|b3|a6|b6|a7|b7| + |c2|d2|c3|d3|c6|d6|c7|d7| + |e2|f2|e3|f3|e6|f6|e7|f7| + |g2|h2|g3|h3|g6|h6|g7|h7| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4| + |a1|b1|c1|d1|a5|b5|c5|d5| + |e0|f0|g0|h0|e4|f4|g4|h4| + |e1|f1|g1|h1|e5|f5|g5|h5| + 
|a2|b2|c2|d2|a6|b6|c6|d6| + |a3|b3|c3|d3|a7|b7|c7|d7| + |e2|f2|g2|h2|e6|f6|g6|h6| + |e3|f3|g3|h3|e7|f7|g7|h7| +*/ +#define FP32_INTERLEAVE_8x8(regArray) \ + regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \ + regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15); + + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x16(regArray) \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6); + +#define FP32_ACCUM2_8x16_ARRAY(regArray) \ + 
regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \ + regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \ + regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[6]); + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x8(regArray) \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6); + + +/* Store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, 
_mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + y) to y +*/ +#define 
STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Masked store 16 (alpha * result) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Store 8 (alpha * result) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Masked store 8 (alpha * result) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Store 4 (alpha * result) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult)); + + +/* Masked store 4 (alpha * result) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult)); + + +/* Store 16 result to y +*/ +#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 result to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + 
_mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 result to y +*/ +#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 result to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 result to y +*/ +#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 result to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + +#endif diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c new file mode 100644 index 000000000..18e64dc3f --- /dev/null +++ b/kernel/x86_64/sbgemv_n.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_n_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr + +#define ALIGN64_FREE(ptr) \ + free(ptr) + +#ifndef HAVE_SBGEMV_N_ACCL_KERNEL +static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + BLASLONG offset_lda, offset_m; + float accum = 0.0; + float tmp_x = 0.0; + + bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n); + float * a_fp32 = malloc(sizeof(float)*m*n); + float * x_fp32 = malloc(sizeof(float)*n); + + for (BLASLONG j=0; j= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SBGEMV_N_ACCL_KERNEL 1 +#include "common.h" +#include + +// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios +#undef ZERO_BETA +#undef ONE_BETA +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios +#undef ZERO_BETA +#define ONE_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA 
in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#define ONE_ALPHA 1 +#include "sbgemv_n_microk_cooperlake_template.c" + +static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + if (beta == ZERO) { // BETA == 0.0, no need to accumulate the original Y data + if (alpha == ONE) { // ALPHA == 1.0, no need to multipy ALPHA + sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y); + } else { // ALPHA != 1.0, need to multipy ALPHA + sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } + } else { // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is + if (beta == ONE) { + sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c new file mode 100644 index 000000000..46e6d0ff9 --- /dev/null +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -0,0 +1,234 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" + +// Include common macros for BF16 based operations with IA intrinsics +#include "bf16_common_macros.h" + +#ifndef ZERO_BETA // Beta is non-zero + +#ifndef ONE_BETA // BETA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA + +#else // BETA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE 
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE + +#endif + +#else // BETA is zero + +#ifndef ONE_ALPHA // ALPHA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA + +#else // ALPHA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT + +#endif + +#endif + + + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_128x = m & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, 
accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + __m512i ZERO512 = _mm512_setzero_si512(); + + unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa); + __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value); + unsigned int blend_lo_mask_value = ((unsigned int)0x55555555); + __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value); + + __m512i M512_EPI32_8 = _mm512_set1_epi32(8); + __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + matrixArray_2 = 
_mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1); + matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1); + matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2); + matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2); + matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3); + matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0) + BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0) + BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0) + BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0) + BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0) + BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3); + accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3); + accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5); + accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5); + accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7); + accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7); + + STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0) + STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16) + STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32) + STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48) + STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64) + STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80) + STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96) + STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112) + } + + for (BLASLONG idx_m = tag_m_128x; idx_m < 
tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + + STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0) + STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16) + } + + if (tag_m_32x != m) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); + + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + + if ((m-tag_m_32x) > 16) { + STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) + STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) + } else { + 
STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask) + } + } + + return 0; +} diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c new file mode 100644 index 000000000..22b099116 --- /dev/null +++ b/kernel/x86_64/sbgemv_t.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_t_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr + +#define ALIGN64_FREE(ptr) \ + free(ptr) + +#ifndef HAVE_SBGEMV_T_ACCL_KERNEL +static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + BLASLONG offset_lda, offset_n; + float accum = 0.0; + + bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n); + float * a_fp32 = malloc(sizeof(float)*m*n); + float * x_fp32 = malloc(sizeof(float)*n); + + for (BLASLONG i=0; i= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SBGEMV_T_ACCL_KERNEL 1 + +// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios +#undef ZERO_BETA +#undef ONE_BETA +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios +#undef ZERO_BETA +#define ONE_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#define ONE_ALPHA 1 +#include "sbgemv_t_microk_cooperlake_template.c" + +static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + if (beta == ZERO) { // BETA == 0.0, no need to accumulate the original Y data + if (alpha == ONE) { // ALPHA == 1.0, no need to multipy ALPHA + if 
(n > 127) { + sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1 (m, alpha, a, x, y); break; + case 2: sbgemv_kernel_32x2 (m, alpha, a, x, y); break; + case 3: sbgemv_kernel_32x3 (m, alpha, a, x, y); break; + case 4: sbgemv_kernel_16x4 (m, alpha, a, x, y); break; + case 5: sbgemv_kernel_30x5 (m, alpha, a, x, y); break; + case 6: sbgemv_kernel_16x6 (m, alpha, a, x, y); break; + case 7: sbgemv_kernel_16x7 (m, alpha, a, x, y); break; + case 8: sbgemv_kernel_16x8 (m, alpha, a, x, y); break; + case 9: sbgemv_kernel_14x9 (m, alpha, a, x, y); break; + case 10: sbgemv_kernel_12x10(m, alpha, a, x, y); break; + case 11: sbgemv_kernel_15x11(m, alpha, a, x, y); break; + case 12: sbgemv_kernel_15x12(m, alpha, a, x, y); break; + case 13: sbgemv_kernel_16x13(m, alpha, a, x, y); break; + case 14: sbgemv_kernel_16x14(m, alpha, a, x, y); break; + case 15: sbgemv_kernel_16x15(m, alpha, a, x, y); break; + case 16: sbgemv_kernel_16x16(m, alpha, a, x, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y); + } + } + } + } else { // ALPHA != 1.0, need to multipy ALPHA + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha (m, alpha, a, x, y); break; + case 2: sbgemv_kernel_32x2_alpha (m, alpha, a, x, y); break; + case 3: sbgemv_kernel_32x3_alpha (m, alpha, a, x, y); break; + case 4: sbgemv_kernel_16x4_alpha (m, alpha, a, x, y); break; + case 5: sbgemv_kernel_30x5_alpha (m, alpha, a, x, y); break; + case 6: 
sbgemv_kernel_16x6_alpha (m, alpha, a, x, y); break; + case 7: sbgemv_kernel_16x7_alpha (m, alpha, a, x, y); break; + case 8: sbgemv_kernel_16x8_alpha (m, alpha, a, x, y); break; + case 9: sbgemv_kernel_14x9_alpha (m, alpha, a, x, y); break; + case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y); break; + case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y); break; + case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y); break; + case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y); break; + case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y); break; + case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y); break; + case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y); + } + } + } + } + } else { // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is + if (beta == ONE) { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y); break; + case 11: 
sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y); break; + case 16: sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } + } + } + } else { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break; + case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, 
y); break; + case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + } + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c new file mode 100644 index 000000000..51e681add --- /dev/null +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -0,0 +1,3082 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" +// Include common macros for BF16 based operations with IA intrinsics +#include "bf16_common_macros.h" + +#ifndef ZERO_BETA // Beta is non-zero + +#ifndef ONE_BETA // BETA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA + +#else // BETA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE + +#endif + +#else // BETA is zero + +#ifndef ONE_ALPHA // ALPHA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA +#define 
STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA + +#else // ALPHA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT + +#endif + +#endif + + +// 32 rows parallel processing BF16 GEMV kernel for n=1 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2; + __m512i xArray; + __m512 result_0, result_1; +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA +#ifndef ONE_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif +#endif + + __m512i load_idx_lo = _mm512_set_epi16(0, 15, 0, 14, 0, 13, 0, 12, 0, 11, 0, 10, 0, 9, 0, 8,\ + 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0); + __m512i M512_EPI16_16 = _mm512_set1_epi16(16); + __m512i load_idx_hi = _mm512_add_epi16(load_idx_lo, M512_EPI16_16); + + unsigned int interleve_mask_value = ((unsigned int) 0x55555555); + __mmask32 interleave_mask = *((__mmask32*) &interleve_mask_value); + + xArray = 
_mm512_set1_epi16((short) x[0]); + xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-tail_num)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> (32-tail_num)); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask) + } else if (tail_num > 8) { + __m256 result256_0 = _mm256_setzero_ps(); + __m256 result256_1 = _mm256_setzero_ps(); + + __m256i load_idx_lo256 = 
_mm512_castsi512_si256(load_idx_lo); + __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1); + __m256i xArray256 = _mm512_castsi512_si256(xArray); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 16 rows with n=1 + __m256i matrixArray256_1 = _mm256_permutexvar_epi16(load_idx_lo256, matrixArray256_0); // Expand the low 8 elements + __m256i matrixArray256_2 = _mm256_permutexvar_epi16(load_idx_hi256, matrixArray256_0); // Expand the high 8 elements + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256); + + unsigned char store_mask_value = (((unsigned char)0xff) >> (16-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask) + } else { + __m128 result128_0 = _mm_setzero_ps(); + __m128 result128_1 = _mm_setzero_ps(); + + __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0); + __m128i M128_EPI16_4 = _mm_set1_epi16(4); + __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4); + + __m128i xArray128 = _mm512_castsi512_si128(xArray); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 8 rows with n=1 + __m128i matrixArray128_1 = _mm_permutexvar_epi16(load_idx_lo128, matrixArray128_0); // Expand the low 4 elements + __m128i matrixArray128_2 = _mm_permutexvar_epi16(load_idx_hi128, matrixArray128_0); // Expand the high 4 elements + + result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) 
xArray128); + result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128); + + if (tail_num > 4) { + unsigned char store_mask_value = (((unsigned char)0xf) >> (8-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x) + STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask) + } else { + unsigned char store_mask_value = (((unsigned char)0xf) >> (4-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask) + } + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=2 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1; + __m512i xArray; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x)); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]); // Load 16 rows as n=2 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]); // Load 16 rows 
as n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (m - tag_m_32x >= 16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + + tag_m_32x += 16; + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 8) { + result_0 = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask) + } else if (tail_num == 8) { + __m256 result256 = _mm256_setzero_ps(); + + __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_32x) + } else { + __m256 result256 = _mm256_setzero_ps(); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask) + } + + return 0; +} + +// 32 rows parallel 
processing BF16 GEMV kernel for n=3 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|0|0|0|0|0| + __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // x2| 0|x2| 0|...|x2| 0| + + __m512i load_idx_base; + __m512i M512_EPI16_2, M512_EPI16_8, M512_EPI16_16; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + M512_EPI16_16 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 0); + + if (tag_m_32x > 0) { + __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6; + + unsigned int idx_blend_mask_value = ((unsigned int)0x80000000); + __mmask32 idx_blend_mask = *((__mmask32*) 
&idx_blend_mask_value); + + load_idx01_1st = load_idx_base; + load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16); + load_idx2_1st = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2); + load_idx2_2nd = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2); + load_idx2_2nd = _mm512_mask_blend_epi16(idx_blend_mask, load_idx2_2nd, _mm512_setzero_si512()); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]); // Load 10 rows with n=3 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]); // Load 10 rows with n=3 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]); // Load 10 rows with n=3 plus 2 element + + matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1); // Select the first 2 elements for each row + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2); // Select the first 2 elements for each row + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_0, load_idx2_1st, matrixArray_1); // Select the third element for each row + matrixArray_6 = _mm512_permutex2var_epi16(matrixArray_1, load_idx2_2nd, matrixArray_2); // Select the third element for each row + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (tag_m_32x != m) { + __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd; + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, 
matrixArray256_5, matrixArray256_6; + __m256 result256_0, result256_1; + + unsigned short idx256_blend_mask_value = ((unsigned short)0x8000); + __mmask16 idx256_blend_mask = *((__mmask16*) &idx256_blend_mask_value); + + load256_idx01_1st = _mm512_castsi512_si256(load_idx_base); + load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8)); + load256_idx2_1st = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_add_epi16(load256_idx01_2nd, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_mask_blend_epi16(idx256_blend_mask, load256_idx2_2nd, _mm256_setzero_si256()); + + if (m - tag_m_32x > 15) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, 
(__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8) + + tag_m_32x += 16; + } + + if (tag_m_32x != m) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + BLASLONG tail_num = m-tag_m_32x; + + if (tail_num > 10) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + 
} else if (tail_num > 5) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows + matrixArray256_2 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num*3))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]); // Load m-tag_m_32x rows + matrixArray256_1 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_5 = 
_mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } + + unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num))); + __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value); + __m512 result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1); + STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask) + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=4 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_01, xArray_23, xArray_remix; + __m512 result; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1); + + unsigned char x_load_mask_value = 
(((unsigned char)0xf) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x); // |x0|x1|x2|x3|0|0|0|0| + xArray_01 = _mm512_broadcastd_epi32(xTmp); // |x0|x1|x0|x1|...|x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // |x2|x3|x2|x3|...|x2|x3| + unsigned short blend_mask_value = ((unsigned short)0xff00); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23); // |x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3| + + if (tag_m_16x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]); // Load 8 rows with n=4 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]); // Load 8 rows with n=4 + + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1); // |a0|a1|...|h0|h1|i0|i1|...|p0|p1| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1); // |a2|a3|...|h2|h3|i2|i3|...|p2|p3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01); + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23); + + STORE16_COMPLETE_RESULT(result, y+idx_m) + } + } + + if (m - tag_m_16x > 7) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]); // Load 8 rows with n=4 + matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + BLASLONG tail_num = m-tag_m_16x; + if (tail_num != 0) { + result = _mm512_setzero_ps(); + + unsigned short tail_mask_value = 
(((unsigned short)0xffff) >> (16-tail_num*2)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]); // Load 8 rows with n=4 + matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1)); + + unsigned char store_tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 store_tail_mask = *((__mmask8*) &store_tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask) + } + + return 0; +} + +// 30 rows parallel processing BF16 GEMV kernel for n=5 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_30x = m - (m%30); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|0|0|0| + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512 result_0, result_1; + __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3| + __m512i xArray_4 = 
_mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4| 0|x4| 0|...|x4| 0| + + __m512i M512_EPI16_2 = _mm512_set1_epi16(2); + __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0, 58, 57, 53, 52, 48, 47, 43, 42, + 38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10, 6, 5, 1, 0); + __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39); + __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f); + + __m512i load_idx23_stage1_1st = _mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2); + __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2); + __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2); + + __m512i load_idx4_stage1_1st = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2); + __m512i load_idx4_stage1_2nd = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2); + __m512i load_idx4_stage1_3rd = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2; + __m512i matrixArray_stage2_0, matrixArray_stage2_1; + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + if (tag_m_30x > 0) { + unsigned short blend_mask_value_0 = ((unsigned short)0xf000); + __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0); + unsigned short blend_mask_value_1 = ((unsigned short)0x3f00); + __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1); + for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, 
&a[(idx_m)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]); // Load 6 rows with n=5 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]); // Load 6 rows with n=5 + matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]); // Load 6 rows with n=5 + + // Process the 0|1 elements + // Stage 1: Select the 0|1 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 0|1 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01); + + // Process the 2|3 elements + // Stage 1: Select the 2|3 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 2|3 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the 
result of the 2|3 elements and accumulate the result of 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23); + + // Process the for 4 elements + // Stage 1: Select the 4 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 4 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 4 element and accumulate the result of 0|1 and 2|3 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_4); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_4); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask) + } + } + + if (m - tag_m_30x > 11) { + BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12); + for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) { + unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value); + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5 + + // Interleave the elements + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = 
_mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + // Calculate and accumulate the result + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask) + tag_m_30x += 12; + } + } + + BLASLONG tail_num = m - tag_m_30x; + if (tail_num > 6) { + unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num))); + __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value); + unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5)); + __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value); + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]); // Load x rows with n=5 + + // Interleave the elements + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + // Calculate and accumulate the result + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask) + } else { + __m128i 
matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_30x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]); // Load 1 rows with n=5 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=6 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|x5|0|0| + + if (tag_m_16x > 0) { + __m512 result_0; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i load_idx01_1st = _mm512_set_epi32( 0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i load_idx01_2nd = _mm512_set_epi32(13, 
10, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1); + __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1); + + __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1); + __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1); + + unsigned short blend_mask_value = ((unsigned short)0x0400); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + // Set the 11th element to be 0 as invalid index for a 512 bit epi32 register + load_idx45_1st = _mm512_mask_blend_epi32(blend_mask, load_idx45_1st, load_idx01_2nd); + // Set the 11th element to be 0 as 0 is the correct index + load_idx45_2nd = _mm512_mask_blend_epi32(blend_mask, load_idx45_2nd, load_idx01_2nd); + + __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3| + __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4|x5|x4|x5|...|x4|x5| + + unsigned short permute_mask01_uint = (((unsigned short)0xf800)); + __mmask16 permute_mask01 = *((__mmask16*) &permute_mask01_uint); + unsigned short permute_mask45_uint = (((unsigned short)0xfc00)); + __mmask16 permute_mask45 = *((__mmask16*) &permute_mask45_uint); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2; + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]); // Load 5 rows with n=6 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]); // Load 5 rows with n=6 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]); // Load 5 rows with n=6 plus 2 element + + // Stage 1: interleave for the a..k elements + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, 
matrixArray_1); + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1); + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1); + + // Stage 2: interleave for the l..p elements and remix together + matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2); + matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, permute_mask01, load_idx23_2nd, matrixArray_2); + matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, permute_mask45, load_idx45_2nd, matrixArray_2); + + // Calculate the result of the 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1); + __m256i load_idx01_1st = _mm256_set_epi32( 0, 0, 15, 12, 9, 6, 3, 0); + __m256i load_idx01_2nd = _mm256_set_epi32( 5, 2, 0, 0, 0, 0, 0, 0); + + __m256i load_idx23_1st = _mm256_add_epi32(load_idx01_1st, M256_EPI32_1); + __m256i load_idx23_2nd = _mm256_add_epi32(load_idx01_2nd, M256_EPI32_1); + unsigned char blend_mask_value = ((unsigned char)0x20); + __mmask8 blend_mask = *((__mmask8*) &blend_mask_value); + // Set the 6th element to be 0 as invalid index for a 512 bit epi32 register + load_idx23_1st = _mm256_mask_blend_epi32(blend_mask, load_idx23_1st, load_idx01_2nd); + // Set the 6th element to be 0 as 0 is the correct index + load_idx23_2nd = _mm256_mask_blend_epi32(blend_mask, load_idx23_2nd, load_idx01_2nd); + + __m256i load_idx45_1st = _mm256_add_epi32(load_idx23_1st, M256_EPI32_1); + __m256i load_idx45_2nd = _mm256_add_epi32(load_idx23_2nd, M256_EPI32_1); 
+ + unsigned char permute_mask01_uint = (((unsigned char)0xc0)); + __mmask8 permute_mask01 = *((__mmask8*) &permute_mask01_uint); + unsigned char permute_mask45_uint = (((unsigned char)0xe0)); + __mmask8 permute_mask45 = *((__mmask8*) &permute_mask45_uint); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2; + __m256i matrixArray_stage_0; + __m256 result256_0; + + result256_0 = _mm256_setzero_ps(); + + matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + + // Process the 0|1 elements + // Select the 0|1 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_01)); + + // Process the 2|3 elements + // Select the 2|3 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx23_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_23)); + + // Process the for 4 elements + // Select the 4|5 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx45_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 
= _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_45)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x) + tag_m_16x += 8; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]); // Load 1 rows with n=6 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=7 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 1); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i matrixArray_stage_0, matrixArray_stage_1, 
matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_0123, xArray_4567; + __m512 result_0, result_1, result_2, result_3; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_2 = _mm512_set1_epi32(2); + __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14, + 31, 13, 12, 11, 10, 9, 8, 7, 31, 6, 5, 4, 3, 2, 1, 0); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]); // Load 4 rows with n=7 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]); // Load 4 rows with n=7 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // 
|e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2); // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7| + matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3); // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 3: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = 
_mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + + tag_m_16x += 8; + } + + BLASLONG tail_num = m - tag_m_16x; + if (tail_num > 3) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + unsigned int tail_load_mask_value = (((unsigned int)0xffffffff) >> (4+(8-tail_num)*7)); + __mmask32 tail_load_mask = *((__mmask32*) &tail_load_mask_value); + matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); 
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask) + tag_m_16x = m; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]); // Load 1 rows with n=7 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=8 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, 
matrixArray_2, matrixArray_3; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_0123, xArray_4567; + __m512 result_0, result_1, result_2, result_3; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_2 = _mm512_set1_epi32(2); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]); // Load 4 rows with n=8 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]); // Load 4 rows with n=8 + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]); // Load 4 rows with n=8 + matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, 
load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 2: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + BLASLONG tail_num = m - tag_m_16x; + if (tail_num > 3) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8 + unsigned short tail_load_mask_value = (((unsigned int)0xffff) >> ((8-tail_num)*4)); + __mmask16 tail_load_mask = *((__mmask16*) 
&tail_load_mask_value); + matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask) + tag_m_16x = m; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 14 rows parallel processing BF16 GEMV kernel for n=9 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, 
float *y) +#else +static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_14x = m - (m%14); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| + + if (tag_m_14x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI16_2 = _mm256_set1_epi16(2); + __m256i idx_base_0 = _mm256_set_epi16( 0, 0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10, 9, 1, 0); + __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2); + __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2); + __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2); + __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = 
_mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 7); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3| + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5| + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7| + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|0 |x8| 0| ... |x8| 0| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short blend_mask_value = ((unsigned short)0x3f80); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, 
matrixArray_1); // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5); // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) 
matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_14x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_14x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 12 rows parallel processing BF16 GEMV kernel for n=10 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_12x = m - (m%12); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); + 
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| + + if (tag_m_12x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_base_0 = _mm256_set_epi32( 0, 0, 26, 21, 16, 10, 5, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1); + __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1); + __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1); + __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 11, 10, 9, 8, 7, 6); + + xArray_01 = 
_mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3| + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5| + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7| + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|x9|x8|x9| ... |x8|x9| + + unsigned short blend_mask_value = ((unsigned short)0x0fc0); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]); // Load 3 rows with n=10 + matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]); // Load 3 rows with n=10 + matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]); // Load 3 rows with n=10 + matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]); // Load 3 rows with n=10 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x| + 
matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x| + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x| + + // Stage 3: interleave per 256 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5); // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_12x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned char load256_mask_value = (((unsigned char)0xff) >> 3); + __mmask8 load256_mask = *((__mmask8*) &load256_mask_value); + for (BLASLONG i = tag_m_12x; i < m; i++) { + result256 = _mm256_setzero_ps(); + 
matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=11 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 
result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, M512_EPI32_5; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2); + M512_EPI32_5 = _mm512_set1_epi32(5); + + unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff); + __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value); + unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00); + __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value); + unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000); + __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value); + + idx_stage1_base_0 = _mm512_set_epi16( 0, 0, 49, 48, 38, 37, 27, 26, 16, 15, 5, 4, 47, 46, 36, 35, + 25, 24, 14, 13, 3, 2, 45, 44, 34, 33, 23, 22, 12, 11, 1, 0); + idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6); + + idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6); + + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4); + idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6); + + unsigned short 
idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|0 |x10|0 | ... 
|x10|0 | + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]); // Load 2 rows with n=11 plus 1 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x | + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x | + matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1| + matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 
|k7|...|o6 |o7| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) 
xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=12 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef 
ONE_ALPHA +static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5; + M512_EPI32_1 = _mm512_set1_epi32(1); + M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1); + M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1); + M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2); + + unsigned short BASE_MASK_10_value = ((unsigned short)0x001f); + __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value); + unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0); + __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value); + unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00); + __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value); + + idx_stage1_base_0 = 
_mm512_set_epi32( 0, 26, 20, 14, 8, 2, 25, 19, 13, 7, 1, 24, 18, 12, 6, 0); + idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3); + + idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3); + + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2); + idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... 
|x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|x11|x10|x11| ... |x10|x11| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]); // Load 2 rows with n=12 plus 4 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]); // Load 2 rows with n=12 plus 4 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]); // Load 2 rows with n=12 plus 4 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 | + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 | + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8 |f9 |...|j8 |j9 
|f6|f7|...|j6|j7|f10|f11|...|j10|j11| + matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 | + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 | + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, 
matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + + +// 16 
rows parallel processing BF16 GEMV kernel for n=13 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = 
_mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + 
BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + 
+ accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]); // Load 1 rows with n=13 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=14 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // 
|x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + __m512i shift_idx = _mm512_set_epi32(0, 13, 12, 11, 10, 9, 8, 7, 0, 6, 5, 4, 3, 2, 1, 0); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_8x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = 
_mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_4x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = 
_mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]); // Load 1 rows with n=14 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=15 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0| + + if (tag_m_16x > 0) { + __m512i 
matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + 
matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the 
temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]); // Load 1 rows with n=15 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] 
= alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=16 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = 
_mm512_setzero_ps(); + + matrixArray_8 = _mm512_loadu_si512(&a[(idx_m )*16]); // Load 2 rows with n=16 + matrixArray_9 = _mm512_loadu_si512(&a[(idx_m+2 )*16]); // Load 2 rows with n=16 + matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]); // Load 2 rows with n=16 + matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]); // Load 2 rows with n=16 + matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]); // Load 2 rows with n=16 + matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]); // Load 2 rows with n=16 + matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]); // Load 2 rows with n=16 + matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]); // Load 2 rows with n=16 + matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + 
BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, \ + matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + + matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0); + matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1); + matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1); + matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = 
_mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n>16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n)); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|... 
+ +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512 accum512_0, accum512_1, accum512_2, accum512_3; + __m256 accum256; + __m128 accum128; + + if (tag_m_8x > 0) { + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = x512; + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..h[0:31] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_2 = _mm512_add_ps(accum512_2, accum512_3); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1)); + STORE8_COMPLETE_RESULT(accum256, y+idx_m) + } + + if (m - tag_m_8x > 3) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 4 rows from matrix + BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate 
the temp result for a..d[0:31] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x) + tag_m_8x += 4; + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 rows with n=16 + accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, 
float *y) +#else +static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + BLASLONG tag_n_128x = n & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m512 accum512_bridge[8]; + __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + if (tag_m_8x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + for (int j = idx_m; j < idx_m + 8; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + 
BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + // Temply save the result into a ZMM + accum512_bridge[j-idx_m] = accum512_t_0; + } + + FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge) + FP32_ACCUM2_8x16_ARRAY(accum512_bridge) + accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]); + accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]); + accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG j = tag_m_8x; j < m; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + 
accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = 
_mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[j] = alpha * accum128[0] + beta * y[j]; +#else + y[j] = alpha * accum128[0] + y[j]; +#endif +#else +#ifndef ONE_ALPHA + y[j] = accum128[0] * alpha; +#else + y[j] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + if (tag_m_8x > 0) { + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 
26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 8 rows from matrix + BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n) + + // Load x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + if (tag_n_32x != n) { // Go with masked 512 + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask) + + // Load x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x16(accum512) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x16(accum512) + + accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4); + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4); + accum512_1 = _mm512_add_ps(accum512_1, accum512_2); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 32 elements from matrix + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n) + + // Load 32 elements from x + BF16_VECTOR_LOAD_1x32(xArray_0, x, 
idx_n) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + if (tag_n_32x != n) { + // Load tail elements from matrix + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask) + + // Load 32 elements from x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n<16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, 
matrixArray_7; + __m256i xArray256; + + // Keep align with other kernels and macro definition, the high 256bit is never used +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif + + __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ + accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, accum256_14, accum256_15; + + __m256i M256_EPI32_4 = _mm256_set1_epi32(4); + __m256i idx_base_0 = _mm256_set_epi32(11, 10, 9, 8, 3, 2, 1, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_4); + + unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n)); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + + if (n == 16) { + BF16_VECTOR_LOAD_1x16(xArray256, x, 0) + } else { + BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask) + } + + if (n == 16) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = 
_mm256_setzero_ps(); + matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); + y[i] += accum128[0] * alpha; + } + } + } else { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = _mm256_setzero_ps(); + matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 rows with n=16 + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 
0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + } + + return 0; +} From c5e62dad69ca13d48c2e9ce29a6398668e687dc9 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 29 Oct 2020 03:37:51 +0800 Subject: [PATCH 022/121] Fix cooperlake compile issue Add a missing macro which is required in Makefile.x86_64 due to recent clearnup, which causes cooperlake platform build failure. --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 6d985786d..52d3e2cdc 100644 --- a/Makefile.system +++ b/Makefile.system @@ -319,6 +319,7 @@ ifeq ($(GCCVERSIONGTEQ7),1) else GCCDUMPVERSION_PARAM := -dumpversion endif +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif From b43549188525741f311d6e5574c0fd960f964204 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 29 Oct 2020 14:57:51 -0500 Subject: [PATCH 023/121] Optimize caxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. 
--- kernel/power/KERNEL.POWER10 | 6 +- kernel/power/caxpy_microk_power10.c | 188 ++++++++++++++++++++++++++++ kernel/power/caxpy_power10.c | 126 +++++++++++++++++++ 3 files changed, 315 insertions(+), 5 deletions(-) create mode 100644 kernel/power/caxpy_microk_power10.c create mode 100644 kernel/power/caxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 1e514fcc9..b4c7a5e41 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -143,11 +143,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power9.S -else -CAXPYKERNEL = caxpy.c -endif +CAXPYKERNEL = caxpy_power10.c ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy_power10.c diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c new file mode 100644 index 000000000..0d13416b3 --- /dev/null +++ b/kernel/power/caxpy_microk_power10.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8 (long n, float *x, float *y, + float alpha_r, float alpha_i) +{ +#if !defined(CONJ) + static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + const float *mvecp = mvec; + /* We have to load reverse mask for big endian. 
*/ + /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ + + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + long ytmp; + + __asm__ + ( + "xscvdpspn 32, %7 \n\t" + "xscvdpspn 33, %8 \n\t" + "xxspltw 32, 32, 0 \n\t" + "xxspltw 33, 33, 0 \n\t" + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + "mr %4, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 34, 64(%4) \n\t" + "stxvp 38, 96(%4) \n\t" + + "addi %4, %4, 128 \n\t" + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + + "lxvp 48, 0(%3) \n\t" // y0 + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 34, 64(%4) \n\t" + "stxvp 38, 96(%4) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=b" (ytmp) // 4 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 7 + "d" (alpha_i), // 8 + "4" (mvecp), // 9 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59" + ); +} diff --git a/kernel/power/caxpy_power10.c b/kernel/power/caxpy_power10.c new file mode 100644 index 000000000..14b8cda67 --- /dev/null +++ b/kernel/power/caxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "caxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + caxpy_kernel_8 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + From 1f564d729b147fb79831008af820a018f500a73a Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: 
Sat, 31 Oct 2020 10:00:48 -0400 Subject: [PATCH 024/121] fix avx2 detection reword commits to make it clearer --- cpuid_x86.c | 16 ++++++++-------- driver/others/dynamic.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 728d459d1..84c12ff43 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -202,7 +202,7 @@ int support_avx(){ if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); if((eax & 6) == 6){ - ret=1; //OS support AVX + ret=1; //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2)) } } return ret; @@ -219,8 +219,8 @@ int support_avx2(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 + if((ebx & (1<<5)) != 0) + ret=1; //CPU supports AVX2 return ret; #else return 0; @@ -235,14 +235,14 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & 32) != 32){ - ret=0; //OS does not even support AVX2 + if((ebx & (1<<5)) == 0){ + ret=0; //cpu does not have avx2 flag } - if((ebx & (1<<31)) != 0){ + if((ebx & (1<<31)) != 0){ //AVX512VL flag xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL - } + ret=1; //OS supports saving zmm registers + } return ret; #else return 0; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 21d2c7948..58f4d8b59 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -330,8 +330,8 @@ int support_avx2(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 + if((ebx & (1<<5)) != 0) + ret=1; //AVX2 flag is set return ret; #else return 0; @@ -346,13 +346,13 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) == 0){ - ret=0; //OS does not even support AVX2 + if((ebx & (1<<5)) == 0){ + ret=0; //cpu does not have avx2 flag } - if((ebx & 
(1u<<31)) != 0){ + if((ebx & (1<<31)) != 0){ //AVX512VL flag is set xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL + ret=1; //OS supports saving zmm register } return ret; #else From 9fab65e90ad35253014cd9620be0caaabf5f130b Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 1 Nov 2020 00:38:08 +0200 Subject: [PATCH 025/121] add openbsd gfortran --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index f894aa9ac..c12b0f2ef 100644 --- a/f_check +++ b/f_check @@ -33,7 +33,7 @@ if ($compiler eq "") { "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", - "flang", + "flang", "egfortran", "ifort"); OUTER: From 7f26be4802042d7c54bd1645c54adc3e2ff72d50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 00:00:43 +0100 Subject: [PATCH 026/121] Reunify BUFFERSIZE across arm64 platforms to avoid segfaults in DYNAMIC_ARCH --- common_arm64.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 314946282..9cdded305 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -142,14 +142,8 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(CORTEXA57) -#define BUFFER_SIZE (20 << 20) -#elif defined(TSV110) || defined(EMAG8180) #define BUFFER_SIZE (32 << 20) #else -#define BUFFER_SIZE (16 << 20) -#endif -#else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif From dd7a9cc5bf6b926a44b38d13366743691fd6e604 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sat, 31 Oct 2020 18:28:57 -0500 Subject: [PATCH 027/121] POWER10: Change dgemm unroll factors Changing the unroll factors for dgemm to 8 shows improved performance with POWER10 MMA feature. Also made some minor changes in sgemm for edge cases. 
--- kernel/power/KERNEL.POWER10 | 14 +- kernel/power/dgemm_kernel_power10.c | 431 +++++++++++++-------------- kernel/power/dgemm_ncopy_8_power10.c | 326 ++++++++++++++++++++ kernel/power/sgemm_kernel_power10.c | 70 ++--- param.h | 4 + 5 files changed, 568 insertions(+), 277 deletions(-) create mode 100644 kernel/power/dgemm_ncopy_8_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index b4c7a5e41..28c39051f 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power10.c -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = dgemm_ncopy_4_power8.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = dgemm_ncopy_8_power10.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b2a29140e..b531799a6 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif ) { - BLASLONG N = n; BLASLONG i1; #if defined(TRMMKERNEL) BLASLONG off; @@ -158,85 +157,232 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * 
B, off = -offset; #endif v4sf_t valpha = { alpha, alpha }; - N = n >> 2; - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j, temp; + BLASLONG j, temp; FLOAT *CO; FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif CO = C; - C += ldc << 2; + C += ldc << 3; AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { - FLOAT *BO; + FLOAT *BO; #if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 4); + REFRESH_POINTERS (8, 8); #else BO = B; temp = k; #endif v4sf_t *rowC; v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; - PREFETCH1 (CO, 0); - PREFETCH1 (CO + ldc, 0); - PREFETCH1 (CO + ldc + ldc, 0); - PREFETCH1 (CO + ldc + ldc + ldc, 0); - PREFETCH1 (CO, 128); - PREFETCH1 (CO + ldc, 128); - PREFETCH1 (CO + ldc + ldc, 128); - PREFETCH1 (CO + ldc + ldc + ldc, 128); - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; + __vector_pair rowB, rowB1; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); for (l = 1; l < temp; l++) { - rowA = (vec_t *) & 
AO[l << 4]; - rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 3]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); } SAVE_ACC (&acc0, 0); - SAVE_ACC (&acc2, 4); - SAVE_ACC (&acc1, 2); - SAVE_ACC (&acc3, 6); - SAVE_ACC (&acc4, 8); - SAVE_ACC (&acc6, 12); - SAVE_ACC (&acc5, 10); - SAVE_ACC (&acc7, 14); - AO += temp << 4; - BO += temp << 2; + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + SAVE_ACC (&acc4, 4); + SAVE_ACC1 (&acc5, 4); + SAVE_ACC (&acc6, 6); + SAVE_ACC1 (&acc7, 6); + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + if (m & 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); 
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + if (m & 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 2; + AO += temp << 1; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 4) + REFRESH_AFTER_SAVE (2, 8) #endif - CO += 16; } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t 
t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; + v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; + v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; + v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA * rowB1; + t2 += rowA * rowB2; + t3 += rowA * rowB3; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; + CO[4 * ldc] = t2[0]; + CO[5 * ldc] = t2[1]; + CO[6 * ldc] = t3[0]; + CO[7 * ldc] = t3[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; + CO[4 * ldc] += t2[0]; + CO[5 * ldc] += t2[1]; + CO[6 * ldc] += t3[0]; + CO[7 * ldc] += t3[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + B += k << 3; + } + if (n & 4) + { + BLASLONG j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j 
= 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { - BLASLONG i, j, temp; + BLASLONG j, temp; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc << 1; AO = A; - i = m >> 4; - for (j = 0; j < i; j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 2); -#else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); - for (l = 1; l < temp; l++) - { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - rowA = (vec_t *) & AO[l << 4]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); - } - SAVE2x4_ACC (&acc0, 0); - SAVE2x4_ACC (&acc1, 
2); - SAVE2x4_ACC (&acc2, 4); - SAVE2x4_ACC (&acc3, 6); - SAVE2x4_ACC (&acc4, 8); - SAVE2x4_ACC (&acc5, 10); - SAVE2x4_ACC (&acc6, 12); - SAVE2x4_ACC (&acc7, 14); - CO += 16; - AO += temp << 4; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 2) -#endif - } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 1; } - N = (n & 1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 1) -#else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - v4sf_t t2 = { 0, 0 }; - v4sf_t t3 = { 0, 0 }; - v4sf_t t4 = { 0, 0 }; - v4sf_t t5 = { 0, 0 }; - v4sf_t t6 = { 0, 0 }; - v4sf_t t7 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; - v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; - v4sf_t rowA2 = { AO[(l << 4) + 
4], AO[(l << 4) + 5] }; - v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; - v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; - v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; - v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; - v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; - t4 += rowA4 * rowB; - t5 += rowA5 * rowB; - t6 += rowA6 * rowB; - t7 += rowA7 * rowB; - } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; - t4 = t4 * valpha; - t5 = t5 * valpha; - t6 = t6 * valpha; - t7 = t7 * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; - CO[2] = t1[0]; - CO[3] = t1[1]; - CO[4] = t2[0]; - CO[5] = t2[1]; - CO[6] = t3[0]; - CO[7] = t3[1]; - CO[8] = t4[0]; - CO[9] = t4[1]; - CO[10] = t5[0]; - CO[11] = t5[1]; - CO[12] = t6[0]; - CO[13] = t6[1]; - CO[14] = t7[0]; - CO[15] = t7[1]; -#else - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t1[0]; - CO[3] += t1[1]; - CO[4] += t2[0]; - CO[5] += t2[1]; - CO[6] += t3[0]; - CO[7] += t3[1]; - CO[8] += t4[0]; - CO[9] += t4[1]; - CO[10] += t5[0]; - CO[11] += t5[1]; - CO[12] += t6[0]; - CO[13] += t6[1]; - CO[14] += t7[0]; - CO[15] += t7[1]; -#endif - AO += temp << 4; - BO += temp; - CO += 16; - i -= 16; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 1) -#endif - } - while (i >= 8) + for (i = 0; i < (m >> 3); i++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; #if 
defined(TRMMKERNEL) @@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/kernel/power/dgemm_ncopy_8_power10.c b/kernel/power/dgemm_ncopy_8_power10.c new file mode 100644 index 000000000..9836c2e7f --- /dev/null +++ b/kernel/power/dgemm_ncopy_8_power10.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include +#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp09, ctemp17, ctemp33; + IFLOAT ctemp25, ctemp41; + IFLOAT ctemp49, ctemp57; + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + PREFETCHA (aoffset1, 384); + PREFETCHA (aoffset2, 384); + PREFETCHA (aoffset3, 384); + PREFETCHA (aoffset4, 384); + PREFETCHA (aoffset5, 384); + PREFETCHA (aoffset6, 384); + PREFETCHA (aoffset7, 384); + 
PREFETCHA (aoffset8, 384); + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset1 + 2); + __vector double va2 = *(__vector double*)(aoffset1 + 4); + __vector double va3 = *(__vector double*)(aoffset1 + 6); + + __vector double va4 = *(__vector double*)(aoffset2 + 0); + __vector double va5 = *(__vector double*)(aoffset2 + 2); + __vector double va6 = *(__vector double*)(aoffset2 + 4); + __vector double va7 = *(__vector double*)(aoffset2 + 6); + + __vector double va8 = *(__vector double*)(aoffset3 + 0); + __vector double va9 = *(__vector double*)(aoffset3 + 2); + __vector double va10 = *(__vector double*)(aoffset3 + 4); + __vector double va11 = *(__vector double*)(aoffset3 + 6); + + __vector double va12 = *(__vector double*)(aoffset4 + 0); + __vector double va13 = *(__vector double*)(aoffset4 + 2); + __vector double va14 = *(__vector double*)(aoffset4 + 4); + __vector double va15 = *(__vector double*)(aoffset4 + 6); + + __vector double va16 = *(__vector double*)(aoffset5 + 0); + __vector double va17 = *(__vector double*)(aoffset5 + 2); + __vector double va18 = *(__vector double*)(aoffset5 + 4); + __vector double va19 = *(__vector double*)(aoffset5 + 6); + + __vector double va20 = *(__vector double*)(aoffset6 + 0); + __vector double va21 = *(__vector double*)(aoffset6 + 2); + __vector double va22 = *(__vector double*)(aoffset6 + 4); + __vector double va23 = *(__vector double*)(aoffset6 + 6); + + __vector double va24 = *(__vector double*)(aoffset7 + 0); + __vector double va25 = *(__vector double*)(aoffset7 + 2); + __vector double va26 = *(__vector double*)(aoffset7 + 4); + __vector double va27 = *(__vector double*)(aoffset7 + 6); + + __vector double va28 = *(__vector double*)(aoffset8 + 0); + __vector double va29 = *(__vector double*)(aoffset8 + 2); + __vector double va30 = *(__vector double*)(aoffset8 + 4); + __vector double va31 = *(__vector double*)(aoffset8 + 6); + + *(__vector double*)(boffset + 0) = 
vec_xxpermdi(va0, va4, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0); + *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0); + *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0); + *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3); + *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3); + *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3); + *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3); + + *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0); + *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0); + *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0); + *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0); + *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3); + *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3); + *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3); + *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3); + + *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0); + *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0); + *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0); + *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0); + *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3); + *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3); + *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3); + *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3); + + *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0); + *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0); + *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0); + *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0); + *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3); + *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3); + *(__vector double*)(boffset + 60) = 
vec_xxpermdi(va19, va23, 3); + *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3); + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + PREFETCHA (aoffset1, 384); + PREFETCHA (aoffset2, 384); + PREFETCHA (aoffset3, 384); + PREFETCHA (aoffset4, 384); + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset1 + 2); + __vector double va2 = *(__vector double*)(aoffset2 + 0); + __vector double va3 = *(__vector double*)(aoffset2 + 2); + __vector double va4 = *(__vector double*)(aoffset3 + 0); + __vector double va5 = *(__vector double*)(aoffset3 + 2); + __vector double va6 = *(__vector double*)(aoffset4 + 0); + __vector double va7 = *(__vector double*)(aoffset4 + 2); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0); + *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3); + *(__vector double*)(boffset + 6) = 
vec_xxpermdi(va4, va6, 3); + *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0); + *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0); + *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3); + *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset2 + 0); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3); + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 9fbf84695..80f495f70 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, 
FLOAT * B, #endif ) { - BLASLONG N = n; BLASLONG i1; #if defined(TRMMKERNEL) BLASLONG off; @@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif v4sf_t valpha = { alpha, alpha, alpha, alpha }; - N = n >> 3; - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j, temp; + BLASLONG j, temp; FLOAT *CO; FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) @@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 4); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif CO += 16; } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 8) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 8) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 8) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 3; } - N = (n & 7) >> 2; - for (i1 = 0; i1 < N; i1++) + if (n & 4) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 4) #endif } - i = (m & 15) >> 3; - for (j = 0; j 
< i; j++) + if (m & 8) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 2) #endif } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; v4sf_t *rowC; @@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; v4sf_t *rowC; @@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; BLASLONG l = 0; @@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 1; } - N = (n & 
1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) + for (i = 0; i < (m >> 4); i++) { FLOAT *BO; BLASLONG l = 0; @@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 4; BO += temp; CO += 16; - i -= 16; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (16, 1) #endif } - while (i >= 8) + if (m & 8) { FLOAT *BO; BLASLONG l = 0; @@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; BLASLONG l = 0; @@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/param.h b/param.h index f3ddde6a1..2047e4776 100644 --- a/param.h +++ b/param.h @@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SBGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_R 4096 +#undef DGEMM_DEFAULT_UNROLL_M +#undef DGEMM_DEFAULT_UNROLL_N +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 #endif #if defined(SPARC) && defined(V7) From 40a93c232b6a9a09fb0cf10a8de5ba6ca94070a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 21:58:26 +0100 Subject: [PATCH 028/121] Disable EXPRECISION for DYNAMIC_ARCH in combination with TARGET=GENERIC NO_EXPRECISION is disabled for the GENERIC_TARGET already, so prevent mixing with code parts that use a different float size by default --- Makefile.system | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index 52d3e2cdc..b62eab379 100644 --- a/Makefile.system +++ b/Makefile.system @@ -93,6 +93,11 @@ endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) GETARCH_FLAGS += -DUSER_TARGET +ifeq ($(TARGET), GENERIC) +ifeq ($(DYNAMIC_ARCH), 1) +override NO_EXPRECISION=1 +endif +endif endif # Force fallbacks for 32bit From 6baf8af6588725ee720bcfad12e235a61df5deb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 22:11:48 +0100 Subject: [PATCH 029/121] Disable EXPRECISION for the combination of DYNAMIC_CORE and GENERIC target --- cmake/os.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/os.cmake b/cmake/os.cmake index c644bc3f7..98428c624 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,6 +84,10 @@ if (X86) set(NO_EXPRECISION 1) endif () +if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC)) + set(NO_EXPRECISION 1) +endif () + if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") set(SANITY_CHECK 1) From e5f8c2bf8ae438ec6b626f9fe6711101ad004d3d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 22:25:43 +0100 Subject: [PATCH 030/121] typo fix --- cmake/os.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index 98428c624..1eb2b7472 
100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,7 +84,7 @@ if (X86) set(NO_EXPRECISION 1) endif () -if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC)) +if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC")) set(NO_EXPRECISION 1) endif () From 60997ddd73b00dcdd86086e166483fcc70aa2a3d Mon Sep 17 00:00:00 2001 From: Aisha Tammy Date: Mon, 2 Nov 2020 13:04:53 +0000 Subject: [PATCH 031/121] allow setting soname without suffix or prefix Allows to create a library with a different SONAME without the need to add suffixes to symbols Backwards compatible and should have no effect on the workflow and previous users. Useful for allowing INTERFACE64 library alongside the standard library without file conflicts --- Makefile.install | 16 ++++++++-------- Makefile.system | 8 ++++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Makefile.install b/Makefile.install index 7c1a3ca43..e8b64465f 100644 --- a/Makefile.install +++ b/Makefile.install @@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) -OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas +OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig @@ -150,13 +150,13 @@ endif endif #Generating openblas.pc - @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 
'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" #Generating OpenBLASConfig.cmake diff --git a/Makefile.system b/Makefile.system index 52d3e2cdc..afbdb6bab 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1263,10 +1263,14 @@ ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif +ifndef LIBSONAMEBASE +LIBSONAMEBASE = openblas +endif + ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) From b9bc76aec4c869fed0b5cfbbe11336206a6ff5ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Nov 2020 22:43:50 +0100 
Subject: [PATCH 032/121] Add files via upload --- cmake/os.cmake | 4 +++- cmake/prebuild.cmake | 30 ++++++++++++++++++++++++++++++ cmake/system.cmake | 31 +++++++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index 1eb2b7472..feb4c05d1 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,9 +84,11 @@ if (X86) set(NO_EXPRECISION 1) endif () -if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC")) +if (DYNAMIC_ARCH) +if (${TARGET} STREQUAL "GENERIC") set(NO_EXPRECISION 1) endif () +endif () if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 3e38abbf5..b1b4c501a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -139,6 +139,36 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_N 4) + elseif ("${TCORE}" STREQUAL "BARCELONA") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "STEAMROLLER") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "EXCAVATOR") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "NEHALEM") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "PRESCOTT") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX\n") + elseif ("${TCORE}" STREQUAL "HASWELL") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX2\n") + elseif ("${TCORE}" STREQUAL "ZEN") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX2\n") + elseif ("${TCORE}" STREQUAL "SKYLAKEX") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX512\n") + elseif ("${TCORE}" STREQUAL "COOPERLAKE") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX512\n") elseif ("${TCORE}" STREQUAL 
"ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 4cc46236d..83b79bab2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -64,12 +64,39 @@ if (DEFINED TARGET) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() endif() + if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") + endif() + if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx") + endif() + if (${TARGET} STREQUAL "BARCELONA") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "STEAMROLLER") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "EXCAVATOR") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "PILEDRIVER") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "PRESCOTT") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "NEHALEM") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "CORE2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() From a9f9354296d448ffc087fc618d4fc9c39b56f72c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Nov 2020 23:17:46 +0100 Subject: [PATCH 
033/121] Fix target test --- cmake/os.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/os.cmake b/cmake/os.cmake index feb4c05d1..e24059dd5 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -85,10 +85,12 @@ if (X86) endif () if (DYNAMIC_ARCH) +if (TARGET) if (${TARGET} STREQUAL "GENERIC") set(NO_EXPRECISION 1) endif () endif () +endif () if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") From 0155cd53a3c29e8a57cdef504a4a685bc7ea098a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Nov 2020 23:45:49 +0100 Subject: [PATCH 034/121] Add -msse3 where needed for DYNAMIC_ARCH builds --- cmake/system.cmake | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 83b79bab2..48d206b12 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -67,34 +67,31 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2") endif() endif() + if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") + endif() if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx") endif() - if (${TARGET} STREQUAL "BARCELONA") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "STEAMROLLER") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "EXCAVATOR") + if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - 
if (${TARGET} STREQUAL "PILEDRIVER") + if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - if (${TARGET} STREQUAL "PRESCOTT") + if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - if (${TARGET} STREQUAL "NEHALEM") + if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - if (${TARGET} STREQUAL "CORE2") + if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() if (DEFINED HAVE_SSE) From 8cc73fee98684b49fdd1869e44b3d6a816cdb407 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Nov 2020 23:47:04 +0100 Subject: [PATCH 035/121] Export NO_EXPRECISION after overriding for DYNAMIC_ARCH with GENERIC target --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index b62eab379..ca302a98a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -96,6 +96,7 @@ GETARCH_FLAGS += -DUSER_TARGET ifeq ($(TARGET), GENERIC) ifeq ($(DYNAMIC_ARCH), 1) override NO_EXPRECISION=1 +export NO_EXPRECiSION endif endif endif From d9ba49165af15d535d9b9955bd248eab4d259f06 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Sun, 27 Sep 2020 10:38:19 +0800 Subject: [PATCH 036/121] Improve the performance of rot by using AVX512 and AVX2 intrinsic --- driver/others/blas_l1_thread.c | 2 +- driver/others/blas_server_win32.c | 11 +- kernel/x86_64/KERNEL.HASWELL | 3 + kernel/x86_64/drot.c | 139 +++++++++++++++++++++++++ kernel/x86_64/drot_microk_haswell-2.c | 87 ++++++++++++++++ kernel/x86_64/drot_microk_skylakex-2.c | 94 +++++++++++++++++ kernel/x86_64/srot.c | 139 +++++++++++++++++++++++++ kernel/x86_64/srot_microk_haswell-2.c | 87 ++++++++++++++++ 
kernel/x86_64/srot_microk_skylakex-2.c | 91 ++++++++++++++++ 9 files changed, 648 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/drot.c create mode 100644 kernel/x86_64/drot_microk_haswell-2.c create mode 100644 kernel/x86_64/drot_microk_skylakex-2.c create mode 100644 kernel/x86_64/srot.c create mode 100644 kernel/x86_64/srot_microk_haswell-2.c create mode 100644 kernel/x86_64/srot_microk_skylakex-2.c diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index 04acbcc5f..06039c952 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha break; } - mode |= BLAS_LEGACY; + if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY; for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index d2cc91757..f47908c70 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ routine = queue -> routine; - if (!(queue -> mode & BLAS_LEGACY)) { + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); - } else { - legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index b979fc0ae..81eaf96ac 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c + +SROTKERNEL = srot.c 
+DROTKERNEL = drot.c diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c new file mode 100644 index 000000000..a312b7ff9 --- /dev/null +++ b/kernel/x86_64/drot.c @@ -0,0 +1,139 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "drot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "drot_microk_haswell-2.c" +#endif + +#ifndef HAVE_DROT_KERNEL + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + FLOAT f0, f1, f2, f3; + FLOAT x0, x1, x2, x3; + FLOAT g0, g1, g2, g3; + FLOAT y0, y1, y2, y3; + + FLOAT* xp = x; + FLOAT* yp = y; + + BLASLONG n1 = n & (~7); + + while (i < n1) { + x0 = xp[0]; + y0 = yp[0]; + x1 = xp[1]; + y1 = yp[1]; + x2 = xp[2]; + y2 = yp[2]; + x3 = xp[3]; + y3 = yp[3]; + + f0 = c*x0 + s*y0; + g0 = c*y0 - s*x0; + f1 = c*x1 + s*y1; + g1 = c*y1 - s*x1; + f2 = c*x2 + s*y2; + g2 = c*y2 - s*x2; + f3 = c*x3 + s*y3; + g3 = c*y3 - s*x3; + + xp[0] = f0; + yp[0] = g0; + xp[1] = f1; + yp[1] = g1; + xp[2] = f2; + yp[2] = g2; + xp[3] = f3; + yp[3] = g3; + + xp += 4; + yp += 4; + i += 4; + } + + while (i < n) { + FLOAT temp = c*x[i] + s*y[i]; + y[i] = c*y[i] - s*x[i]; + x[i] = temp; + + i++; + } +} + +#endif +static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return; + if ((inc_x == 1) && (inc_y == 1)) { + drot_kernel(n, x, y, c, s); + } + else { + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + } + } + return; +} + + +#if defined(SMP) +static int rot_thread_function(blas_arg_t *args) +{ + + rot_compute(args->m, + args->a, args->lda, + args->b, args->ldb, + ((FLOAT *)args->alpha)[0], + ((FLOAT *)args->alpha)[1]); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG 
ldc, int (*function)(), int nthreads); +#endif +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ +#if defined(SMP) + int nthreads; + FLOAT alpha[2]={c, s}; + FLOAT dummy_c; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 100000) { + nthreads = 1; + } + else { + nthreads = num_cpu_avail(1); + } + + if (nthreads == 1) { + rot_compute(n, x, inc_x, y, inc_y, c, s); + } + else { +#if defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; +#else + int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + } +#else + rot_compute(n, x, inc_x, y, inc_y, c, s); +#endif + return 0; +} diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c new file mode 100644 index 000000000..72a87696e --- /dev/null +++ b/kernel/x86_64/drot_microk_haswell-2.c @@ -0,0 +1,87 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DROT_KERNEL 1 + +#include +#include + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + + BLASLONG tail_index_4 = n&(~3); + BLASLONG tail_index_16 = n&(~15); + + __m256d c_256, s_256; + if (n >= 4) { + c_256 = _mm256_set1_pd(c); + s_256 = _mm256_set1_pd(s); + } + + __m256d x0, x1, x2, x3; + __m256d y0, y1, y2, y3; + __m256d t0, t1, t2, t3; + + for (i = 0; i < tail_index_16; i += 16) { + x0 = _mm256_loadu_pd(&x[i + 0]); + x1 = _mm256_loadu_pd(&x[i + 4]); + x2 = _mm256_loadu_pd(&x[i + 8]); + x3 = _mm256_loadu_pd(&x[i +12]); + y0 = _mm256_loadu_pd(&y[i + 0]); + y1 = _mm256_loadu_pd(&y[i + 4]); + y2 = _mm256_loadu_pd(&y[i + 8]); + y3 = _mm256_loadu_pd(&y[i +12]); + + t0 = _mm256_mul_pd(s_256, y0); + t1 = _mm256_mul_pd(s_256, y1); + t2 = _mm256_mul_pd(s_256, y2); + t3 = 
_mm256_mul_pd(s_256, y3); + + t0 = _mm256_fmadd_pd(c_256, x0, t0); + t1 = _mm256_fmadd_pd(c_256, x1, t1); + t2 = _mm256_fmadd_pd(c_256, x2, t2); + t3 = _mm256_fmadd_pd(c_256, x3, t3); + + _mm256_storeu_pd(&x[i + 0], t0); + _mm256_storeu_pd(&x[i + 4], t1); + _mm256_storeu_pd(&x[i + 8], t2); + _mm256_storeu_pd(&x[i +12], t3); + + t0 = _mm256_mul_pd(s_256, x0); + t1 = _mm256_mul_pd(s_256, x1); + t2 = _mm256_mul_pd(s_256, x2); + t3 = _mm256_mul_pd(s_256, x3); + + t0 = _mm256_fmsub_pd(c_256, y0, t0); + t1 = _mm256_fmsub_pd(c_256, y1, t1); + t2 = _mm256_fmsub_pd(c_256, y2, t2); + t3 = _mm256_fmsub_pd(c_256, y3, t3); + + _mm256_storeu_pd(&y[i + 0], t0); + _mm256_storeu_pd(&y[i + 4], t1); + _mm256_storeu_pd(&y[i + 8], t2); + _mm256_storeu_pd(&y[i +12], t3); + + } + + for (i = tail_index_16; i < tail_index_4; i += 4) { + x0 = _mm256_loadu_pd(&x[i]); + y0 = _mm256_loadu_pd(&y[i]); + + t0 = _mm256_mul_pd(s_256, y0); + t0 = _mm256_fmadd_pd(c_256, x0, t0); + _mm256_storeu_pd(&x[i], t0); + + t0 = _mm256_mul_pd(s_256, x0); + t0 = _mm256_fmsub_pd(c_256, y0, t0); + _mm256_storeu_pd(&y[i], t0); + } + + for (i = tail_index_4; i < n; ++i) { + FLOAT temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } +} +#endif diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c new file mode 100644 index 000000000..4e862e663 --- /dev/null +++ b/kernel/x86_64/drot_microk_skylakex-2.c @@ -0,0 +1,94 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DROT_KERNEL 1 + +#include +#include + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG n1 = n; + + BLASLONG tail_index_8 = 0; + BLASLONG tail_index_32 = 0; + + __m512d c_512 = _mm512_set1_pd(c); + __m512d s_512 = _mm512_set1_pd(s); + + tail_index_8 = n1 & (~7); + tail_index_32 = n1 & (~31); + + + __m512d 
x0, x1, x2, x3; + __m512d y0, y1, y2, y3; + __m512d t0, t1, t2, t3; + + for (i = 0; i < tail_index_32; i += 32) { + x0 = _mm512_loadu_pd(&x[i + 0]); + x1 = _mm512_loadu_pd(&x[i + 8]); + x2 = _mm512_loadu_pd(&x[i +16]); + x3 = _mm512_loadu_pd(&x[i +24]); + y0 = _mm512_loadu_pd(&y[i + 0]); + y1 = _mm512_loadu_pd(&y[i + 8]); + y2 = _mm512_loadu_pd(&y[i +16]); + y3 = _mm512_loadu_pd(&y[i +24]); + + t0 = _mm512_mul_pd(s_512, y0); + t1 = _mm512_mul_pd(s_512, y1); + t2 = _mm512_mul_pd(s_512, y2); + t3 = _mm512_mul_pd(s_512, y3); + + t0 = _mm512_fmadd_pd(c_512, x0, t0); + t1 = _mm512_fmadd_pd(c_512, x1, t1); + t2 = _mm512_fmadd_pd(c_512, x2, t2); + t3 = _mm512_fmadd_pd(c_512, x3, t3); + + _mm512_storeu_pd(&x[i + 0], t0); + _mm512_storeu_pd(&x[i + 8], t1); + _mm512_storeu_pd(&x[i +16], t2); + _mm512_storeu_pd(&x[i +24], t3); + + t0 = _mm512_mul_pd(s_512, x0); + t1 = _mm512_mul_pd(s_512, x1); + t2 = _mm512_mul_pd(s_512, x2); + t3 = _mm512_mul_pd(s_512, x3); + + t0 = _mm512_fmsub_pd(c_512, y0, t0); + t1 = _mm512_fmsub_pd(c_512, y1, t1); + t2 = _mm512_fmsub_pd(c_512, y2, t2); + t3 = _mm512_fmsub_pd(c_512, y3, t3); + + _mm512_storeu_pd(&y[i + 0], t0); + _mm512_storeu_pd(&y[i + 8], t1); + _mm512_storeu_pd(&y[i +16], t2); + _mm512_storeu_pd(&y[i +24], t3); + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm512_loadu_pd(&x[i]); + y0 = _mm512_loadu_pd(&y[i]); + + t0 = _mm512_mul_pd(s_512, y0); + t0 = _mm512_fmadd_pd(c_512, x0, t0); + _mm512_storeu_pd(&x[i], t0); + + t0 = _mm512_mul_pd(s_512, x0); + t0 = _mm512_fmsub_pd(c_512, y0, t0); + _mm512_storeu_pd(&y[i], t0); + } + + if ((n1&7) > 0) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7))); + __m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]); + __m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]); + __m512d temp = _mm512_mul_pd(s_512, tail_y); + temp = _mm512_fmadd_pd(c_512, tail_x, temp); + 
_mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp); + temp = _mm512_mul_pd(s_512, tail_x); + temp = _mm512_fmsub_pd(c_512, tail_y, temp); + _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp); + } +} +#endif diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c new file mode 100644 index 000000000..021c20d82 --- /dev/null +++ b/kernel/x86_64/srot.c @@ -0,0 +1,139 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "srot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "srot_microk_haswell-2.c" +#endif + +#ifndef HAVE_SROT_KERNEL + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + FLOAT f0, f1, f2, f3; + FLOAT x0, x1, x2, x3; + FLOAT g0, g1, g2, g3; + FLOAT y0, y1, y2, y3; + + FLOAT* xp = x; + FLOAT* yp = y; + + BLASLONG n1 = n & (~7); + + while (i < n1) { + x0 = xp[0]; + y0 = yp[0]; + x1 = xp[1]; + y1 = yp[1]; + x2 = xp[2]; + y2 = yp[2]; + x3 = xp[3]; + y3 = yp[3]; + + f0 = c*x0 + s*y0; + g0 = c*y0 - s*x0; + f1 = c*x1 + s*y1; + g1 = c*y1 - s*x1; + f2 = c*x2 + s*y2; + g2 = c*y2 - s*x2; + f3 = c*x3 + s*y3; + g3 = c*y3 - s*x3; + + xp[0] = f0; + yp[0] = g0; + xp[1] = f1; + yp[1] = g1; + xp[2] = f2; + yp[2] = g2; + xp[3] = f3; + yp[3] = g3; + + xp += 4; + yp += 4; + i += 4; + } + + while (i < n) { + FLOAT temp = c*x[i] + s*y[i]; + y[i] = c*y[i] - s*x[i]; + x[i] = temp; + + i++; + } +} + +#endif +static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return; + if ((inc_x == 1) && (inc_y == 1)) { + srot_kernel(n, x, y, c, s); + } + else { + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + } + } + return; +} + + +#if defined(SMP) +static int rot_thread_function(blas_arg_t *args) +{ + + rot_compute(args->m, + args->a, args->lda, + args->b, args->ldb, + 
((float *)args->alpha)[0], + ((float *)args->alpha)[1]); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ +#if defined(SMP) + int nthreads; + FLOAT alpha[2]={c, s}; + FLOAT dummy_c; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 100000) { + nthreads = 1; + } + else { + nthreads = num_cpu_avail(1); + } + + if (nthreads == 1) { + rot_compute(n, x, inc_x, y, inc_y, c, s); + } + else { +#if defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; +#else + int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + } +#else + rot_compute(n, x, inc_x, y, inc_y, c, s); +#endif + return 0; +} diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c new file mode 100644 index 000000000..cba962042 --- /dev/null +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -0,0 +1,87 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SROT_KERNEL 1 + +#include +#include + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + + BLASLONG tail_index_8 = n&(~7); + BLASLONG tail_index_32 = n&(~31); + + __m256 c_256, s_256; + if (n >= 8) { + c_256 = _mm256_set1_ps(c); + s_256 = _mm256_set1_ps(s); + } + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + __m256 t0, t1, t2, t3; + + for (i = 0; i < tail_index_32; i += 32) { + x0 = _mm256_loadu_ps(&x[i + 0]); + x1 = _mm256_loadu_ps(&x[i + 8]); + x2 = _mm256_loadu_ps(&x[i +16]); + x3 = _mm256_loadu_ps(&x[i +24]); + y0 = 
_mm256_loadu_ps(&y[i + 0]); + y1 = _mm256_loadu_ps(&y[i + 8]); + y2 = _mm256_loadu_ps(&y[i +16]); + y3 = _mm256_loadu_ps(&y[i +24]); + + t0 = _mm256_mul_ps(s_256, y0); + t1 = _mm256_mul_ps(s_256, y1); + t2 = _mm256_mul_ps(s_256, y2); + t3 = _mm256_mul_ps(s_256, y3); + + t0 = _mm256_fmadd_ps(c_256, x0, t0); + t1 = _mm256_fmadd_ps(c_256, x1, t1); + t2 = _mm256_fmadd_ps(c_256, x2, t2); + t3 = _mm256_fmadd_ps(c_256, x3, t3); + + _mm256_storeu_ps(&x[i + 0], t0); + _mm256_storeu_ps(&x[i + 8], t1); + _mm256_storeu_ps(&x[i +16], t2); + _mm256_storeu_ps(&x[i +24], t3); + + t0 = _mm256_mul_ps(s_256, x0); + t1 = _mm256_mul_ps(s_256, x1); + t2 = _mm256_mul_ps(s_256, x2); + t3 = _mm256_mul_ps(s_256, x3); + + t0 = _mm256_fmsub_ps(c_256, y0, t0); + t1 = _mm256_fmsub_ps(c_256, y1, t1); + t2 = _mm256_fmsub_ps(c_256, y2, t2); + t3 = _mm256_fmsub_ps(c_256, y3, t3); + + _mm256_storeu_ps(&y[i + 0], t0); + _mm256_storeu_ps(&y[i + 8], t1); + _mm256_storeu_ps(&y[i +16], t2); + _mm256_storeu_ps(&y[i +24], t3); + + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm256_loadu_ps(&x[i]); + y0 = _mm256_loadu_ps(&y[i]); + + t0 = _mm256_mul_ps(s_256, y0); + t0 = _mm256_fmadd_ps(c_256, s0, t0); + _mm256_storeu_ps(&x[i], t0); + + t0 = _mm256_mul_ps(s_256, x0); + t0 = _mm256_fmsub_ps(c_256, y0, t0); + _mm256_storeu_ps(&y[i], t0); + } + + for (i = tail_index_8; i < n; ++i) { + FLOAT temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } +} +#endif diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c new file mode 100644 index 000000000..a21d1cf64 --- /dev/null +++ b/kernel/x86_64/srot_microk_skylakex-2.c @@ -0,0 +1,91 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SROT_KERNEL 1 + +#include +#include + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + 
BLASLONG i = 0; + __m512 c_512, s_512; + c_512 = _mm512_set1_ps(c); + s_512 = _mm512_set1_ps(s); + + BLASLONG tail_index_16 = n&(~15); + BLASLONG tail_index_64 = n&(~63); + + + __m512 x0, x1, x2, x3; + __m512 y0, y1, y2, y3; + __m512 t0, t1, t2, t3; + + for (i = 0; i < tail_index_64; i += 64) { + x0 = _mm512_loadu_ps(&x[i + 0]); + x1 = _mm512_loadu_ps(&x[i +16]); + x2 = _mm512_loadu_ps(&x[i +32]); + x3 = _mm512_loadu_ps(&x[i +48]); + y0 = _mm512_loadu_ps(&y[i + 0]); + y1 = _mm512_loadu_ps(&y[i +16]); + y2 = _mm512_loadu_ps(&y[i +32]); + y3 = _mm512_loadu_ps(&y[i +48]); + + t0 = _mm512_mul_ps(s_512, y0); + t1 = _mm512_mul_ps(s_512, y1); + t2 = _mm512_mul_ps(s_512, y2); + t3 = _mm512_mul_ps(s_512, y3); + + t0 = _mm512_fmadd_ps(c_512, x0, t0); + t1 = _mm512_fmadd_ps(c_512, x1, t1); + t2 = _mm512_fmadd_ps(c_512, x2, t2); + t3 = _mm512_fmadd_ps(c_512, x3, t3); + + _mm512_storeu_ps(&x[i + 0], t0); + _mm512_storeu_ps(&x[i +16], t1); + _mm512_storeu_ps(&x[i +32], t2); + _mm512_storeu_ps(&x[i +48], t3); + + t0 = _mm512_mul_ps(s_512, x0); + t1 = _mm512_mul_ps(s_512, x1); + t2 = _mm512_mul_ps(s_512, x2); + t3 = _mm512_mul_ps(s_512, x3); + + t0 = _mm512_fmsub_ps(c_512, y0, t0); + t1 = _mm512_fmsub_ps(c_512, y1, t1); + t2 = _mm512_fmsub_ps(c_512, y2, t2); + t3 = _mm512_fmsub_ps(c_512, y3, t3); + + _mm512_storeu_ps(&y[i + 0], t0); + _mm512_storeu_ps(&y[i +16], t1); + _mm512_storeu_ps(&y[i +32], t2); + _mm512_storeu_ps(&y[i +48], t3); + } + + for (i = tail_index_64; i < tail_index_16; i += 16) { + x0 = _mm512_loadu_ps(&x[i]); + y0 = _mm512_loadu_ps(&y[i]); + + t0 = _mm512_mul_ps(s_512, y0); + t0 = _mm512_fmadd_ps(c_512, x0, t0); + _mm512_storeu_ps(&x[i], t0); + + t0 = _mm512_mul_ps(s_512, x0); + t0 = _mm512_fmsub_ps(c_512, y0, t0); + _mm512_storeu_ps(&y[i], t0); + } + + + if ((n & 15) > 0) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15))); + __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]); + __m512 tail_y = 
_mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]); + __m512 temp = _mm512_mul_ps(s_512, tail_y); + temp = _mm512_fmadd_ps(c_512, tail_x, temp); + _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp); + temp = _mm512_mul_ps(s_512, tail_x); + temp = _mm512_fmsub_ps(c_512, tail_y, temp); + _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp); + } +} +#endif From 725ffbf041b021d2f3602b2313e4027aab19ee89 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Thu, 5 Nov 2020 16:25:17 +0800 Subject: [PATCH 037/121] fix typo --- kernel/x86_64/srot_microk_haswell-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index cba962042..8e245cc8f 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -70,7 +70,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) y0 = _mm256_loadu_ps(&y[i]); t0 = _mm256_mul_ps(s_256, y0); - t0 = _mm256_fmadd_ps(c_256, s0, t0); + t0 = _mm256_fmadd_ps(c_256, x0, t0); _mm256_storeu_ps(&x[i], t0); t0 = _mm256_mul_ps(s_256, x0); From 28d2dfe2b3bd6c779137fcb53451f97f47b78b37 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 12:17:49 +0100 Subject: [PATCH 038/121] Fix macro name used in ifdef --- kernel/arm/zdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index ba0e57eb5..73ae3acd7 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__POWER__) +#if !defined(__PPC__) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else From 438a8e5624ef1adfe98f989655ca398866143458 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:26:12 +0100 Subject: [PATCH 039/121] Fix placement of getarch call and spurious cpu property 
accumulation in DYNAMIC_ARCH builds --- cmake/prebuild.cmake | 45 ++++++---------- cmake/system.cmake | 124 ++++++++++++++++++++----------------------- 2 files changed, 73 insertions(+), 96 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b1b4c501a..da7686c33 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_N 4) - elseif ("${TCORE}" STREQUAL "BARCELONA") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "STEAMROLLER") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "EXCAVATOR") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "NEHALEM") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "PRESCOTT") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX\n") - elseif ("${TCORE}" STREQUAL "HASWELL") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX2\n") - elseif ("${TCORE}" STREQUAL "ZEN") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX2\n") - elseif ("${TCORE}" STREQUAL "SKYLAKEX") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX512\n") - elseif ("${TCORE}" STREQUAL "COOPERLAKE") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX512\n") elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" @@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () endif () + unset (HAVE_AVX2) + unset (HAVE_AVX) + unset (HAVE_FMA3) + unset (HAVE_MMX) + unset (HAVE_SSE) + unset (HAVE_SSE2) + unset (HAVE_SSE3) + unset (HAVE_SSSE3) + unset (HAVE_SSE4A) + unset (HAVE_SSE4_1) + unset (HAVE_SSE4_2) + 
unset (HAVE_NEON) + unset (HAVE_VFP) + unset (HAVE_VFPV3) + unset (HAVE_VFPV4) message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 48d206b12..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () -if (DEFINED TARGET) - if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") - else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() - endif() - if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2") - endif() - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 
-mavx") - endif() - if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") - endif() - if (DEFINED HAVE_SSE2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") - endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") - endif() - if (DEFINED HAVE_SSE4_1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") - endif() -endif() if (DEFINED TARGET) + message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --") message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () @@ -211,6 +146,63 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") +if (DEFINED TARGET) + if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() + if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_AVX) + if (NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + endif() + endif() + if (DEFINED HAVE_AVX2) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_FMA3) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + endif() + endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() +endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () From a29338aaa6b364ce99ea30785d1227bd327ce3c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:27:42 +0100 Subject: [PATCH 040/121] Remove extraneous quotes that caused a cmake policy warning --- cmake/cc.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/cmake/cc.cmake b/cmake/cc.cmake index 2f4d1c6d7..b963940d6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") endif () endif () -if (${CORE} STREQUAL "SKYLAKEX") +if (${CORE} STREQUAL SKYLAKEX) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") @@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () -if (${CORE} STREQUAL "COOPERLAKE") +if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) From ccb9731c7b41b601412b00b73f6da98613d66b7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:30:15 +0100 Subject: [PATCH 041/121] Fix propagation of cpu properties to compiler options --- Makefile.x86_64 | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 49a9a0a23..43bfc9ecd 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,9 +9,9 @@ endif endif ifdef HAVE_SSE3 -ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3 @@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif +ifdef HAVE_AVX +CCOMMON_OPT += -mavx +FCOMMON_OPT += -mavx endif +ifdef HAVE_AVX2 +CCOMMON_OPT += -mavx2 +FCOMMON_OPT += -mavx2 +endif +ifdef HAVE_FMA3 +CCOMMON_OPT += -mfma +FCOMMON_OPT += -mfma endif ifeq ($(CORE), SKYLAKEX) @@ -66,8 +76,7 @@ endif endif endif -ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) -ifndef DYNAMIC_ARCH +ifdef HAVE_AVX2 ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 @@ -96,7 +105,6 @@ endif endif endif endif -endif From a04f532edfe65a7e4cf4dfb2dc34d363e2eba065 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:37:03 +0100 Subject: [PATCH 042/121] Reset cpu property flags 
between build cycles in DYNAMIC_ARCH mode --- Makefile.system | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Makefile.system b/Makefile.system index ca302a98a..dc7ed3f3a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else +undefine HAVE_NEON +undefine HAVE_VFP +undefine HAVE_VFPV3 +undefine HAVE_VFPV4 +undefine HAVE_MMX +undefine HAVE_SSE +undefine HAVE_SSE2 +undefine HAVE_SSE3 +undefine HAVE_SSSE3 +undefine HAVE_SSE4_1 +undefine HAVE_SSE4_2 +undefine HAVE_SSE4A +undefine HAVE_SSE5 +undefine HAVE_AVX +undefine HAVE_AVX2 +undefine HAVE_FMA3 include $(TOPDIR)/Makefile_kernel.conf endif @@ -1522,6 +1538,8 @@ export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 export HAVE_AVX +export HAVE_AVX2 +export HAVE_FMA3 export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 From b976a0bf4095fd8b9e80ae3cf0e0f6eab200219e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:39:56 +0100 Subject: [PATCH 043/121] Remove previous workaround for compiler flags related to cpu capabilities in x86_64 DYNAMIC_ARCH builds --- kernel/Makefile | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index e811ed43d..fb1d5d39a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,13 +5,6 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system -ifdef HAVE_SSE3 -CFLAGS += -msse3 -endif -ifdef HAVE_SSSE3 -CFLAGS += -mssse3 -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -38,12 +31,6 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 -endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) - override CFLAGS += -msse -msse2 -endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) From 6e364981a8af0f72ad9e62a69fe62fdedc18255b Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sat, 7 Nov 2020 15:21:58 -0600 Subject: [PATCH 044/121] Optimize sdot/ddot for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. 
--- kernel/power/KERNEL.POWER10 | 6 +- kernel/power/ddot_microk_power10.c | 131 ++++++++++++++++++++++++ kernel/power/ddot_power10.c | 130 ++++++++++++++++++++++++ kernel/power/sdot_microk_power10.c | 135 +++++++++++++++++++++++++ kernel/power/sdot_power10.c | 154 +++++++++++++++++++++++++++++ 5 files changed, 553 insertions(+), 3 deletions(-) create mode 100644 kernel/power/ddot_microk_power10.c create mode 100644 kernel/power/ddot_power10.c create mode 100644 kernel/power/sdot_microk_power10.c create mode 100644 kernel/power/sdot_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 28c39051f..c25cd9f04 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy_power10.c ZCOPYKERNEL = zcopy_power10.c # -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c +SDOTKERNEL = sdot_power10.c +DDOTKERNEL = ddot_power10.c +DSDOTKERNEL = sdot_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CDOTKERNEL = cdot_power9.S else diff --git a/kernel/power/ddot_microk_power10.c b/kernel/power/ddot_microk_power10.c new file mode 100644 index 000000000..3a9865cc0 --- /dev/null +++ b/kernel/power/ddot_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static double ddot_kernel_8 (long n, double *x, double *y) +{ + double dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + + "xsadddp %x0, 32, 33 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=d" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/ddot_power10.c b/kernel/power/ddot_power10.c new file mode 100644 index 000000000..302dceb68 --- /dev/null +++ b/kernel/power/ddot_power10.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "ddot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + dot = ddot_kernel_8(n1, x, y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c new file mode 100644 index 000000000..2f028c5a0 --- /dev/null +++ b/kernel/power/sdot_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static float sdot_kernel_16 (long n, float *x, float *y) +{ + float dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=f" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/sdot_power10.c b/kernel/power/sdot_power10.c new file mode 100644 index 000000000..b61f0a90d --- /dev/null +++ b/kernel/power/sdot_power10.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "sdot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +#if defined (DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + +#if defined (DSDOT) + double mydot = 0.0; + FLOAT asmdot = 0.0; +#else + FLOAT mydot=0.0; +#endif + BLASLONG n1; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + n1 = n & (BLASLONG)(-32); + + 
if ( n1 ) +#if defined(DSDOT) + { + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG n2 = 32; + while (i Date: Sat, 7 Nov 2020 23:37:21 +0100 Subject: [PATCH 045/121] Update Makefile.system --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index dc7ed3f3a..258a84262 100644 --- a/Makefile.system +++ b/Makefile.system @@ -252,7 +252,9 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else +ifdef HAVE_NEON undefine HAVE_NEON +endif undefine HAVE_VFP undefine HAVE_VFPV3 undefine HAVE_VFPV4 From f6a57d8f63ed0f1fa4823d27daafc2cb3a6dc96b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 00:01:36 +0100 Subject: [PATCH 046/121] Update Makefile.system --- Makefile.system | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index 258a84262..da2d452b2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -255,9 +255,15 @@ else ifdef HAVE_NEON undefine HAVE_NEON endif +ifdef HAVE_VFP undefine HAVE_VFP +endif +ifdef HAVE_VFPV3 undefine HAVE_VFPV3 +endif +ifdef HAVE_VFPV4 undefine HAVE_VFPV4 +endif undefine HAVE_MMX undefine HAVE_SSE undefine HAVE_SSE2 From 1c4cfdc13937765dd9bd0ef8b846ba027ec086b3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 00:12:55 +0100 Subject: [PATCH 047/121] Stay compatible with old gmake that did not support undefine --- Makefile.system | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/Makefile.system b/Makefile.system index da2d452b2..aae7ba503 100644 --- a/Makefile.system +++ b/Makefile.system @@ -6,7 +6,7 @@ INCLUDED = 1 ifndef TOPDIR -TOPDIR = . +TOPDIR = . endif # If ARCH is not set, we use the host system's architecture for getarch compile options. 
@@ -252,30 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else -ifdef HAVE_NEON -undefine HAVE_NEON -endif -ifdef HAVE_VFP -undefine HAVE_VFP -endif -ifdef HAVE_VFPV3 -undefine HAVE_VFPV3 -endif -ifdef HAVE_VFPV4 -undefine HAVE_VFPV4 -endif -undefine HAVE_MMX -undefine HAVE_SSE -undefine HAVE_SSE2 -undefine HAVE_SSE3 -undefine HAVE_SSSE3 -undefine HAVE_SSE4_1 -undefine HAVE_SSE4_2 -undefine HAVE_SSE4A -undefine HAVE_SSE5 -undefine HAVE_AVX -undefine HAVE_AVX2 -undefine HAVE_FMA3 +HAVE_NEON= +HAVE_VFP= +HAVE_VFPV3= +HAVE_VFPV4= +HAVE_MMX= +HAVE_SSE= +HAVE_SSE2= +HAVE_SSE3= +HAVE_SSSE3= +HAVE_SSE4_1= +HAVE_SSE4_2= +HAVE_SSE4A= +HAVE_SSE5= +HAVE_AVX= +HAVE_AVX2= +HAVE_FMA3= include $(TOPDIR)/Makefile_kernel.conf endif From ec088bf33aa3034a82b713ea304fe30e36c278ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 13:15:40 +0100 Subject: [PATCH 048/121] Fix missing AVX2 and FMA3 capabilities in FORCE_target mode --- getarch.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/getarch.c b/getarch.c index ab90f36d9..daf669e56 100644 --- a/getarch.c +++ b/getarch.c @@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif @@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" #define LIBNAME "cooperlake" #define CORENAME "COOPERLAKE" #endif @@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" + "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "zen" #define CORENAME "ZEN" #endif From c4c591ac5afc10b5619d1c58b10d5095dc82a2ff Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 10 Nov 2020 16:16:38 +0800 Subject: [PATCH 049/121] fix sum optimize issues --- kernel/arm/sum.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index 63584b95c..a486a1868 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) n *= inc_x; if (inc_x == 1) { -#if V_SIMD +#if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128)) #ifdef DOUBLE const int vstep = v_nlanes_f64; - const int unrollx2 = n & (-vstep * 2); + const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f64 vsum0 = v_zero_f64(); v_f64 vsum1 = v_zero_f64(); - while (i < unrollx2) + v_f64 vsum2 = v_zero_f64(); + v_f64 vsum3 = v_zero_f64(); + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); - vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); - i += vstep * 2; + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep)); + vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2)); + vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3)); } - vsum0 = v_add_f64(vsum0, vsum1); - while (i < unrollx) + vsum0 = v_add_f64( + v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3)); + for (; i < unrollx; i += vstep) { vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); - i += vstep; } sumf = v_sum_f64(vsum0); #else @@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_f32 vsum1 = v_zero_f32(); v_f32 vsum2 
= v_zero_f32(); v_f32 vsum3 = v_zero_f32(); - while (i < unrollx4) + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); - vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); - vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); - vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); - i += vstep * 4; + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); } vsum0 = v_add_f32( v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); - while (i < unrollx) + for (; i < unrollx; i += vstep) { vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); - i += vstep; } sumf = v_sum_f32(vsum0); #endif From 8c0b206d4cf9909017a52919a41406ee303f472e Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 11 Nov 2020 14:33:12 +0800 Subject: [PATCH 050/121] Optimize the performance of rot by using universal intrinsics --- kernel/simd/intrin_avx.h | 10 ++++++ kernel/simd/intrin_avx512.h | 5 +++ kernel/simd/intrin_neon.h | 10 ++++++ kernel/simd/intrin_sse.h | 13 +++++++ kernel/x86_64/drot.c | 68 ++++++++++++++++++++++++++++++++++- kernel/x86_64/srot.c | 70 ++++++++++++++++++++++++++++++++++++- 6 files changed, 174 insertions(+), 2 deletions(-) diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 3f79646e0..fbe531417 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -12,6 +12,8 @@ typedef __m256d v_f64; ***************************/ #define v_add_f32 _mm256_add_ps #define v_add_f64 _mm256_add_pd +#define v_sub_f32 _mm256_sub_ps +#define v_sub_f64 _mm256_sub_pd #define v_mul_f32 _mm256_mul_ps #define v_mul_f64 _mm256_mul_pd @@ -19,12 +21,20 @@ typedef __m256d v_f64; // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps #define v_muladd_f64 _mm256_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm256_fmsub_ps + #define v_mulsub_f64 
_mm256_fmsub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index f00af53e9..8f38eedd9 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -12,11 +12,16 @@ typedef __m512d v_f64; ***************************/ #define v_add_f32 _mm512_add_ps #define v_add_f64 _mm512_add_pd +#define v_sub_f32 _mm512_sub_ps +#define v_sub_f64 _mm512_sub_pd #define v_mul_f32 _mm512_mul_ps #define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps #define v_muladd_f64 _mm512_fmadd_pd +// multiply and subtract, a*b - c +#define v_mulsub_f32 _mm512_fmsub_ps +#define v_mulsub_f64 _mm512_fmsub_pd BLAS_FINLINE float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 22cef10ca..cd44599fe 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -18,6 +18,8 @@ typedef float32x4_t v_f32; ***************************/ #define v_add_f32 vaddq_f32 #define v_add_f64 vaddq_f64 +#define v_sub_f32 vsubq_f32 +#define v_sub_f64 vsubq_f64 #define v_mul_f32 vmulq_f32 #define v_mul_f64 vmulq_f64 @@ -26,16 +28,24 @@ typedef float32x4_t v_f32; // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vfmaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, 
v_f32 b, v_f32 c) + { return vfmaq_f32(vnegq_f32(c), a, b); } #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vmlaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(vnegq_f32(c), a, b); } #endif // FUSED F64 #if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(vnegq_f64(c), a, b); } #endif // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 06a3fe78b..6a542072e 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -12,22 +12,35 @@ typedef __m128d v_f64; ***************************/ #define v_add_f32 _mm_add_ps #define v_add_f64 _mm_add_pd +#define v_sub_f32 _mm_sub_ps +#define v_sub_f64 _mm_sub_pd #define v_mul_f32 _mm_mul_ps #define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps #define v_muladd_f64 _mm_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_fmsub_ps + #define v_mulsub_f64 _mm_fmsub_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps #define v_muladd_f64 _mm_macc_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_msub_ps + #define v_mulsub_f64 _mm_msub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // 
Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index a312b7ff9..66e9ff907 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -7,10 +7,76 @@ #endif #ifndef HAVE_DROT_KERNEL +#include "../simd/intrin.h" static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; +#if V_SIMD_F64 && V_SIMD > 256 + const int vstep = v_nlanes_f64; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f64 __c = v_setall_f64(c); + v_f64 __s = v_setall_f64(s); + v_f64 vx0, vx1, vx2, vx3; + v_f64 vy0, vy1, vy2, vy3; + v_f64 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f64(x + i); + vx1 = v_loadu_f64(x + i + vstep); + vx2 = v_loadu_f64(x + i + vstep * 2); + vx3 = v_loadu_f64(x + i + vstep * 3); + vy0 = v_loadu_f64(y + i); + vy1 = v_loadu_f64(y + i + vstep); + vy2 = v_loadu_f64(y + i + vstep * 2); + vy3 = v_loadu_f64(y + i + vstep * 3); + + vt0 = v_mul_f64(__s, vy0); + vt1 = v_mul_f64(__s, vy1); + vt2 = v_mul_f64(__s, vy2); + vt3 = v_mul_f64(__s, vy3); + + vt0 = v_muladd_f64(__c, vx0, vt0); + vt1 = v_muladd_f64(__c, vx1, vt1); + vt2 = v_muladd_f64(__c, vx2, vt2); + vt3 = v_muladd_f64(__c, vx3, vt3); + + v_storeu_f64(x + i, vt0); + v_storeu_f64(x + i + vstep, vt1); + v_storeu_f64(x + i + vstep * 2, vt2); + v_storeu_f64(x + i + vstep * 3, vt3); + + vt0 = v_mul_f64(__s, vx0); + vt1 = v_mul_f64(__s, vx1); + vt2 = v_mul_f64(__s, vx2); + vt3 = v_mul_f64(__s, vx3); + + vt0 = v_mulsub_f64(__c, vy0, vt0); + vt1 = v_mulsub_f64(__c, vy1, vt1); + vt2 = v_mulsub_f64(__c, vy2, vt2); + vt3 = v_mulsub_f64(__c, vy3, vt3); + + v_storeu_f64(y + i, vt0); + v_storeu_f64(y + i + vstep, vt1); + v_storeu_f64(y + i + vstep * 2, vt2); + v_storeu_f64(y + i + vstep * 3, vt3); + } + + for (; i < unrollx; i += vstep) { + vx0 = v_loadu_f64(x + i); + vy0 = v_loadu_f64(y + i); + + vt0 = v_mul_f64(__s, vy0); + vt0 = v_muladd_f64(__c, vx0, vt0); + 
v_storeu_f64(x + i, vt0); + + vt0 = v_mul_f64(__s, vx0); + vt0 = v_mulsub_f64(__c, vy0, vt0); + v_storeu_f64(y + i, vt0); + } +#else FLOAT f0, f1, f2, f3; FLOAT x0, x1, x2, x3; FLOAT g0, g1, g2, g3; @@ -53,7 +119,7 @@ static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) yp += 4; i += 4; } - +#endif while (i < n) { FLOAT temp = c*x[i] + s*y[i]; y[i] = c*y[i] - s*x[i]; diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 021c20d82..d9583cdfa 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -7,10 +7,78 @@ #endif #ifndef HAVE_SROT_KERNEL +#include"../simd/intrin.h" static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; + +#if V_SIMD + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f32 __c = v_setall_f32(c); + v_f32 __s = v_setall_f32(s); + v_f32 vx0, vx1, vx2, vx3; + v_f32 vy0, vy1, vy2, vy3; + v_f32 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f32(x + i); + vx1 = v_loadu_f32(x + i + vstep); + vx2 = v_loadu_f32(x + i + vstep * 2); + vx3 = v_loadu_f32(x + i + vstep * 3); + vy0 = v_loadu_f32(y + i); + vy1 = v_loadu_f32(y + i + vstep); + vy2 = v_loadu_f32(y + i + vstep * 2); + vy3 = v_loadu_f32(y + i + vstep * 3); + + vt0 = v_mul_f32(__s, vy0); + vt1 = v_mul_f32(__s, vy1); + vt2 = v_mul_f32(__s, vy2); + vt3 = v_mul_f32(__s, vy3); + + vt0 = v_muladd_f32(__c, vx0, vt0); + vt1 = v_muladd_f32(__c, vx1, vt1); + vt2 = v_muladd_f32(__c, vx2, vt2); + vt3 = v_muladd_f32(__c, vx3, vt3); + + v_storeu_f32(x + i, vt0); + v_storeu_f32(x + i + vstep, vt1); + v_storeu_f32(x + i + vstep * 2, vt2); + v_storeu_f32(x + i + vstep * 3, vt3); + + vt0 = v_mul_f32(__s, vx0); + vt1 = v_mul_f32(__s, vx1); + vt2 = v_mul_f32(__s, vx2); + vt3 = v_mul_f32(__s, vx3); + + vt0 = v_mulsub_f32(__c, vy0, vt0); + vt1 = v_mulsub_f32(__c, vy1, vt1); + vt2 = v_mulsub_f32(__c, vy2, vt2); + vt3 = v_mulsub_f32(__c, vy3, vt3); + + 
v_storeu_f32(y + i, vt0); + v_storeu_f32(y + i + vstep, vt1); + v_storeu_f32(y + i + vstep * 2, vt2); + v_storeu_f32(y + i + vstep * 3, vt3); + + } + + for (; i < unrollx; i += vstep) { + vx0 = v_loadu_f32(x + i); + vy0 = v_loadu_f32(y + i); + + vt0 = v_mul_f32(__s, vy0); + vt0 = v_muladd_f32(__c, vx0, vt0); + v_storeu_f32(x + i, vt0); + + vt0 = v_mul_f32(__s, vx0); + vt0 = v_mulsub_f32(__c, vy0, vt0); + v_storeu_f32(y + i, vt0); + } +#else FLOAT f0, f1, f2, f3; FLOAT x0, x1, x2, x3; FLOAT g0, g1, g2, g3; @@ -20,7 +88,6 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) FLOAT* yp = y; BLASLONG n1 = n & (~7); - while (i < n1) { x0 = xp[0]; y0 = yp[0]; @@ -53,6 +120,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) yp += 4; i += 4; } +#endif while (i < n) { FLOAT temp = c*x[i] + s*y[i]; From 5bc0a7583fed3328f176b69419ae12a063f2f4e0 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 11 Nov 2020 15:18:01 +0800 Subject: [PATCH 051/121] only FMA3 and vector larger than 128 have positive effects. 
--- kernel/x86_64/srot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index d9583cdfa..4273f7fe7 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; -#if V_SIMD +#if V_SIMD && (HAVE_FMA3 || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; From a87e537b8cd5844159dd5806204470a945be695d Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 11 Nov 2020 15:53:48 +0800 Subject: [PATCH 052/121] modify macro --- kernel/x86_64/srot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 4273f7fe7..3de586cb8 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; -#if V_SIMD && (HAVE_FMA3 || V_SIMD > 128) +#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; From e5c2ceb6750c4e649aef87e06bd87ed4fcbdc6a5 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 12 Nov 2020 17:35:17 +0800 Subject: [PATCH 053/121] fix the CI failure of lack the head --- kernel/simd/intrin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index ef8fcb865..3802a91e1 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -47,7 +47,7 @@ extern "C" { #endif /** AVX **/ -#ifdef HAVE_AVX +#if defined(HAVE_AVX) || defined(HAVE_FMA3) #include #endif From e0dac6b53b27b2d79404577d17fdee8b2303e123 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 12 Nov 2020 20:31:03 +0800 Subject: [PATCH 054/121] fix the CI failure of target specific option mismatch --- kernel/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..fd9105fee 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,10 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system +ifdef HAVE_FMA3 +CFLAGS += -mfma +endif + ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as From ae0b1dea19bf836fb0c8af3630ccfcbbf4b8e37f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 13 Nov 2020 10:20:24 +0800 Subject: [PATCH 055/121] modify system.cmake to enable fma flag --- cmake/system.cmake | 2 +- kernel/Makefile | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..68df2d900 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -174,7 +174,7 @@ if (DEFINED TARGET) endif() if (DEFINED HAVE_AVX) if (NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx -mfma") endif() endif() if (DEFINED HAVE_AVX2) diff --git a/kernel/Makefile b/kernel/Makefile index fd9105fee..fb1d5d39a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,10 +5,6 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system -ifdef HAVE_FMA3 -CFLAGS += -mfma -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as From d6e7e05bb36d77f26274abf7d8be03dd2bd78c1d Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Fri, 13 Nov 2020 14:20:52 +0800 Subject: [PATCH 056/121] Improve the performance of dasum and sasum when SMP is defined --- kernel/x86_64/dasum.c | 66 +++++++++++++++++++++++++++++++++++++------ kernel/x86_64/sasum.c | 59 ++++++++++++++++++++++++++++++++++---- 2 files changed, 110 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8a40ea4b9..ddec21383 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) } #endif - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = dasum_kernel(n, x); - } + } else { n *= inc_x; - - while(i < n) { + while (i < n) { sumf += ABS_K(x[i]); i += inc_x; } @@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(sumf); } +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; + FLOAT * dummy_b; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = 
num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} + diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 36ec4a737..d0cea9bee 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = sasum_kernel(n, x); } else { - n *= inc_x; while(i < n) { sumf += ABS_K(x[i]); i += inc_x; } + } + return (sumf); +} +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + 
FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT * ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif return(sumf); } From ec4d77c47c46358521c3b38e42eb8bfebcb94ec3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Nov 2020 09:16:34 +0100 Subject: [PATCH 057/121] Add -mfma for HAVE_FMA3 in the non-DYNAMIC_ARCH case as well --- cmake/cc.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index b963940d6..76952152b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,6 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_FMA3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () From b00a0de1323732a1b82c15bc4f0b0bac3e01c262 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 16 Nov 2020 09:14:56 +0800 Subject: [PATCH 058/121] remove the -mfma flag in when the host has AVX. 
--- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 68df2d900..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -174,7 +174,7 @@ if (DEFINED TARGET) endif() if (DEFINED HAVE_AVX) if (NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx -mfma") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") endif() endif() if (DEFINED HAVE_AVX2) From fdf71d66b3799f730bae282edf84345ccdf7c21b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 19 Nov 2020 20:50:42 +1100 Subject: [PATCH 059/121] POWER10: Fix ld version detection LDVERSIONGTEQ35 needs to escape the '>' character. LDVERSIONGTEQ35 is checking the system ld version which may be different to the toolchain being used to compile OpenBLAS. We don't have a path to the linker in our Makefiles, so (ab)use gcc -Wl,--version to get the version of ld in our toolchain. --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index aae7ba503..6ee8beff8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -672,7 +672,7 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) +LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT From 043f3d6faa797e0fe79c165b0a31acf0cf8f2b38 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 19 Nov 2020 21:04:10 +1100 Subject: [PATCH 060/121] POWER10: Use POWER9 as a fallback If the toolchain is too old, or the mma features isn't set on a POWER10 fall back to the POWER9 loops. 
--- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..d60ae68fc 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,9 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; return NULL; } From 60005eb47b5d30dcf35edff8c824a9f9fd9f6e6c Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 19 Nov 2020 14:39:00 +0100 Subject: [PATCH 061/121] Don't overwrite blas_thread_buffer if already set After a fork it is possible that blas_thread_buffer has already allocated memory buffers: goto_set_num_threads does allocate those already and it may be called by num_cpu_avail in case the OpenBLAS NUM_THREADS differ from the OMP num threads. This leads to a memory leak which can cause subsequent execution of BLAS kernels to fail. 
Fixes #2993 --- driver/others/blas_server_omp.c | 48 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index a8b3e9a4b..a576127aa 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -76,10 +76,28 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #endif -void goto_set_num_threads(int num_threads) { +static void adjust_thread_buffers() { int i=0, j=0; + //adjust buffer for each thread + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { + for(j=0; j < blas_cpu_number; j++){ + if(blas_thread_buffer[i][j] == NULL){ + blas_thread_buffer[i][j] = blas_memory_alloc(2); + } + } + for(; j < MAX_CPU_NUMBER; j++){ + if(blas_thread_buffer[i][j] != NULL){ + blas_memory_free(blas_thread_buffer[i][j]); + blas_thread_buffer[i][j] = NULL; + } + } + } +} + +void goto_set_num_threads(int num_threads) { + if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -92,20 +110,7 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); - //adjust buffer for each thread - for(i=0; i Date: Thu, 19 Nov 2020 15:24:57 +0100 Subject: [PATCH 062/121] Add reproducer test for crash after fork See #2993 for an analysis --- utest/CMakeLists.txt | 6 +- utest/Makefile | 3 +- utest/test_fork.c | 4 +- utest/test_post_fork.c | 131 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 utest/test_post_fork.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc5175fc5..357e61301 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -27,13 +27,17 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) +if (NOT 
USE_OPENMP) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_fork.c ) endif() +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} + test_post_fork.c + ) endif() if (NOT NO_LAPACK) diff --git a/utest/Makefile b/utest/Makefile index 31d4ccf00..ac8c6f72a 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -25,10 +25,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) +ifneq ($(USE_OPENMP), 1) OBJS += test_fork.o endif +OBJS += test_post_fork.o endif ifeq ($(C_COMPILER), PGI) diff --git a/utest/test_fork.c b/utest/test_fork.c index 5c976f920..bd531e7fb 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "openblas_utest.h" -void* xmalloc(size_t n) +static void* xmalloc(size_t n) { void* tmp; tmp = malloc(n); @@ -49,7 +49,7 @@ void* xmalloc(size_t n) } #ifdef BUILD_DOUBLE -void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) +static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { char trans1 = 'T'; char trans2 = 'N'; diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c new file mode 100644 index 000000000..9370a02ce --- /dev/null +++ b/utest/test_post_fork.c @@ -0,0 +1,131 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include +#include +#include +#ifdef USE_OPENMP +#include +#endif +#include "openblas_utest.h" + +static void* xmalloc(size_t n) +{ + void* tmp; + tmp = malloc(n); + if (tmp == NULL) { + fprintf(stderr, "You are about to die\n"); + exit(1); + } else { + return tmp; + } +} + +#ifdef BUILD_DOUBLE +static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) +{ + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + int i; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); + for(i = 0; i < n * n; ++i) { + ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); + } +} +#endif + +CTEST(fork, safety_after_fork_in_parent) +{ +#ifndef BUILD_DOUBLE +exit(0); +#else + blasint n = 100; + int i, nthreads_omp; + + double *a, *b, *c, *d; + size_t n_bytes; + + pid_t fork_pid; + + n_bytes = sizeof(*a) * n * n; + + a = xmalloc(n_bytes); + b = xmalloc(n_bytes); + c = xmalloc(n_bytes); + d = xmalloc(n_bytes); + + // Put ones in a, b and n in c (result) + for(i = 0; i < n * n; ++i) { + a[i] = 1; + b[i] = 1; + c[i] = 1 * n; + } + + // Test that OpenBLAS works after a fork. + // This situation routinely happens with Pythons numpy where a + // `sys.platform` calls `uname` in a forked process. + // So we simulate this situation here. + + // There was an issue where a different number of OpenBLAS and OpenMP + // threads triggered a memory leak. So run this multiple times + // with different number of threads set. +#ifdef USE_OPENMP + nthreads_omp = omp_get_max_threads(); + // Run with half the max OMP threads, the max threads and twice that + for(i = (nthreads_omp + 1) / 2; i <= nthreads_omp * 2; i *= 2) { + omp_set_num_threads(i); +#endif + + fork_pid = fork(); + if (fork_pid == -1) { + CTEST_ERR("Failed to fork process."); + } else if (fork_pid == 0) { + // Just pretend to do something, e.g. 
call `uname`, then exit + exit(0); + } else { + // Wait for the child to finish and check the exit code. + int child_status = 0; + pid_t wait_pid = wait(&child_status); + ASSERT_EQUAL(wait_pid, fork_pid); + ASSERT_EQUAL(0, WEXITSTATUS (child_status)); + + // Now OpenBLAS has to work + check_dgemm(a, b, d, c, n); + } +#ifdef USE_OPENMP + } +#endif + +#endif +} From c6c9c24d1b64430033e733c7341a5d37c79e4668 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 22 Nov 2020 16:02:19 +0800 Subject: [PATCH 063/121] Update doc for C910. --- README.md | 7 +++++++ TargetList.txt | 2 ++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index ca034e747..267df5358 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Z13**: Optimized Level-3 BLAS and Level-1,2 - **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2 +#### RISC-V + +- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. + ```sh + make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. 
diff --git a/TargetList.txt b/TargetList.txt index 86177ebca..d19964916 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -107,3 +107,5 @@ Z14 10.RISC-V 64: RISCV64_GENERIC +C910V + From 8a6b17f97dae84fe935d049761399b4dac59652e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:19:31 +0100 Subject: [PATCH 064/121] Change ifndefs to ifneq --- ctest/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index cba904f75..2a893cae8 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -61,7 +61,7 @@ endif all1: $(all1targets) -ifndef CROSS +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat1 @@ -106,7 +106,7 @@ endif all2: $(all2targets) -ifndef CROSS +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 @@ -152,7 +152,7 @@ endif all3: $(all3targets) -ifndef CROSS +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 From 65eb7afaf42450f3073bfc89ed4029e2ee21d61f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:25:36 +0100 Subject: [PATCH 065/121] Change ifndef CROSS to ifneq --- test/Makefile | 360 +++++++++++++++++++------------------------------- 1 file changed, 133 insertions(+), 227 deletions(-) diff --git a/test/Makefile b/test/Makefile index 1ecce0be7..2a893cae8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,269 +1,211 @@ -TOPDIR = .. -include ../Makefile.system +# +# The Makefile compiles c wrappers and testers for CBLAS. +# + +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= + +LIB = $(TOPDIR)/$(LIBNAME) + +stestl1o = c_sblas1.o + +stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o + +stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o + +dtestl1o = c_dblas1.o + +dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o + +dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o + +ctestl1o = c_cblas1.o + +ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o + +ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o + +ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o + +ztestl1o = c_zblas1.o + +ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o + +ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o + +ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o ifeq ($(NOFORTRAN),1) all :: else -all :: level1 level2 level3 +all :: all1 all2 all3 endif ifeq ($(BUILD_SINGLE),1) -S1=sblat1 +all1targets += xscblat1 endif ifeq ($(BUILD_DOUBLE),1) -D1=dblat1 +all1targets += xdcblat1 endif ifeq ($(BUILD_COMPLEX),1) -C1=cblat1 +all1targets += xccblat1 endif ifeq ($(BUILD_COMPLEX16),1) -Z1=zblat1 +all1targets += xzcblat1 endif -level1: $(S1) $(D1) $(C1) $(Z1) +all1: $(all1targets) -ifndef CROSS -ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 -endif -ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 -endif -ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 -endif -ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 -endif -ifdef SMP +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./sblat1 + OMP_NUM_THREADS=2 ./xscblat1 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./dblat1 -endif + OMP_NUM_THREADS=2 ./xdcblat1 +endif ifeq 
($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./cblat1 + OMP_NUM_THREADS=2 ./xccblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./zblat1 + OMP_NUM_THREADS=2 ./xzcblat1 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./sblat1 + OPENBLAS_NUM_THREADS=2 ./xscblat1 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./dblat1 + OPENBLAS_NUM_THREADS=2 ./xdcblat1 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./cblat1 + OPENBLAS_NUM_THREADS=2 ./xccblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./zblat1 -endif + OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif endif endif ifeq ($(BUILD_SINGLE),1) -S2=sblat2 +all2targets += xscblat2 endif ifeq ($(BUILD_DOUBLE),1) -D2=dblat2 +all2targets += xdcblat2 endif ifeq ($(BUILD_COMPLEX),1) -C2=cblat2 +all2targets += xccblat2 endif ifeq ($(BUILD_COMPLEX16),1) -Z2=zblat2 +all2targets += xzcblat2 endif -level2: $(S2) $(D2) $(C2) $(Z2) +all2: $(all2targets) - -ifndef CROSS - rm -f ?BLAT2.SUMM -ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat - @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 -endif -ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat - @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat - @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat - @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 -endif -ifdef SMP - rm -f ?BLAT2.SUMM +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat - @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xscblat2 < sin2 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./dblat2 < ./dblat2.dat - @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || 
exit 0 + OMP_NUM_THREADS=2 ./xdcblat2 < din2 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat - @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xccblat2 < cin2 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat - @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xzcblat2 < zin2 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat - @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat - @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat - @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat - @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 -endif + OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif endif endif -ifeq ($(BUILD_BFLOAT16),1) -B3= test_sbgemm -endif + ifeq ($(BUILD_SINGLE),1) -S3=sblat3 +all3targets += xscblat3 endif ifeq ($(BUILD_DOUBLE),1) -D3=dblat3 +all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) -C3=cblat3 +all3targets += xccblat3 endif ifeq ($(BUILD_COMPLEX16),1) -Z3=zblat3 +all3targets += xzcblat3 endif -level3: $(B3) $(S3) $(D3) $(C3) $(Z3) - +all3: $(all3targets) -ifndef CROSS - rm -f ?BLAT3.SUMM -ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM - @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat - @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 
< ./dblat3.dat - @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat - @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat - @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 -endif -ifdef SMP - rm -f ?BLAT3.SUMM +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_BFLOAT16),1) - OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM - @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 -endif ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat - @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xscblat3 < sin3 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat - @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xdcblat3 < din3 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat - @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xccblat3 < cin3 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat - @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xzcblat3 < zin3 endif else -ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM - @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 -endif ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat - @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat - @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat - @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + 
OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat - @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif endif endif -endif - -level3_3m : zblat3_3m cblat3_3m -ifndef CROSS - rm -f ?BLAT3_3M.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat - @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat - @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 -ifdef SMP - rm -f ?BLAT3_3M.SUMM +all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(USE_OPENMP), 1) - OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat - @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 - OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat - @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifeq ($(BUILD_SINGLE),1) + OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m +endif +ifeq ($(BUILD_COMPLEX16),1) + OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +endif else - OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat - @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 - OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat - @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifeq ($(BUILD_COMPLEX),1) + OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m endif +ifeq ($(BUILD_COMPLEX16),1) + OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif -FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) - -ifeq ($(CORE), C910V) -EXTRALIB = -CEXTRALIB = -endif +clean :: + rm -f x* +FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) @@ -273,90 +215,54 @@ endif endif ifeq ($(BUILD_SINGLE),1) -sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +# Single real +xscblat1: $(stestl1o) c_sblat1.o 
$(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_DOUBLE),1) -dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -else -dblat2: -dblat3: +# Double real +xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) endif -qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - ifeq ($(BUILD_COMPLEX),1) -cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) 
$(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif - -ifeq ($(BUILD_COMPLEX16),1) -zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +# Single complex +xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) endif -ifeq ($(BUILD_BFLOAT16),1) -test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif - -ifeq ($(BUILD_COMPLEX),1) -cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif ifeq ($(BUILD_COMPLEX16),1) -zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif +# Double complex +xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) 
+ $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif -clean: - @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ - sblat1 dblat1 cblat1 zblat1 \ - sblat2 dblat2 cblat2 zblat2 \ - test_sbgemm sblat3 dblat3 cblat3 zblat3 \ - sblat1p dblat1p cblat1p zblat1p \ - sblat2p dblat2p cblat2p zblat2p \ - sblat3p dblat3p cblat3p zblat3p \ - zblat3_3m zblat3_3mp \ - cblat3_3m cblat3_3mp \ - *.stackdump *.dll - -libs: - -prof: - -quick : - $(MAKE) -C $(TOPDIR) libs - -# include ../Makefile.tail +include $(TOPDIR)/Makefile.tail From d3ff1f889fad96bf20cc3536bfab1c9ac58f4056 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:27:17 +0100 Subject: [PATCH 066/121] Convert ifndefs to ifneq --- driver/level3/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index b528dfa2d..78f32b961 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -206,7 +206,7 @@ ifdef SMP COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(SUFFIX) gemm_thread_variable.$(SUFFIX) COMMONOBJS += syrk_thread.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) ifeq ($(BUILD_BFLOAT16),1) SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif @@ -282,7 +282,7 @@ HPLOBJS = \ dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif @@ 
-297,13 +297,13 @@ ifeq ($(BUILD_DOUBLE),1) strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) endif endif ifeq ($(BUILD_COMPLEX),1) SBLASOBJS = sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) endif endif @@ -312,7 +312,7 @@ ifneq ($(BUILD_DOUBLE),1) DBLASOBJS= ifeq ($(BUILD_COMPLEX16),1) DBLASOBJS = dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif endif @@ -332,7 +332,7 @@ ifeq ($(BUILD_COMPLEX16),1) ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) cherk_thread_LC.$(SUFFIX) endif endif From 5fa305172a610264747cf6324bce639c67b3a7b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:29:56 +0100 Subject: [PATCH 067/121] Use ifeq instead of ifdef for user-definable options --- driver/others/Makefile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/driver/others/Makefile b/driver/others/Makefile index 7558ec058..d09444f56 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -7,7 +7,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) ifdef SMP COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) -ifndef NO_AFFINITY +ifneq ($(NO_AFFINITY), 1) COMMONOBJS += init.$(SUFFIX) endif endif @@ -32,11 +32,11 @@ else COMMONOBJS += parameter.$(SUFFIX) endif -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) endif -ifdef QUAD_PRECISION +ifeq ($(QUAD_PRECISION), 1) COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) endif @@ -46,11 +46,9 @@ ifeq ($(C_COMPILER), PGI) endif endif -ifdef USE_CUDA ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif -endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) From 857afcc41d695cf6ed0279d8476bad50e0e9fdf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:31:44 +0100 Subject: [PATCH 068/121] Use ifeq instead of ifdef for user-definable build options --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 7b0bf1792..597956fdb 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -19,7 +19,7 @@ ifeq ($(ARCH), MIPS) SUPPORT_GEMM3M = 1 endif -ifndef NO_FBLAS +ifneq ($(NO_FBLAS), 1) SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ @@ -146,7 +146,7 @@ ZBLAS3OBJS += zgemm3m.$(SUFFIX) endif -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ @@ -511,11 +511,11 @@ endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif -ifdef QUAD_PRECISION +ifeq ($(QUAD_PRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif From ebb8788696a61adba6819c08f323a68e8d2c43c8 Mon 
Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:33:34 +0100 Subject: [PATCH 069/121] Use ifneq instead of ifdef for CROSS option --- utest/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/Makefile b/utest/Makefile index ac8c6f72a..1fc30d088 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -45,7 +45,7 @@ $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) run_test: $(UTESTBIN) -ifndef CROSS +ifneq ($(CROSS), 1) ./$(UTESTBIN) endif From 60e1fddca7634917a56bcc4cb43bbbee08eb136a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:48:22 +0100 Subject: [PATCH 070/121] Ensure that the same (large) BUFFERSIZE is used for all cpus in DYNAMIC_ARCH builds --- common_power.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_power.h b/common_power.h index a61e4e28a..6fb2af30a 100644 --- a/common_power.h +++ b/common_power.h @@ -849,6 +849,10 @@ Lmcount$lazy_ptr: #else #define BUFFER_SIZE ( 16 << 20) #endif +#ifeq ($(DYNAMIC_ARCH), 1) +#undefine BUFFER_SIZE +#define BUFFER_SIZE (64 << 22) +#endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) From 2b114c3f30ff70c23fbbe3215e62f83fadb70f9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 17:16:22 +0100 Subject: [PATCH 071/121] Restore proper Makefile --- test/Makefile | 354 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 224 insertions(+), 130 deletions(-) diff --git a/test/Makefile b/test/Makefile index 2a893cae8..5f653414a 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,211 +1,269 @@ -# -# The Makefile compiles c wrappers and testers for CBLAS. -# - -TOPDIR = .. 
-include $(TOPDIR)/Makefile.system - -override CFLAGS += -DADD$(BU) -DCBLAS -override TARGET_ARCH= -override TARGET_MACH= - -LIB = $(TOPDIR)/$(LIBNAME) - -stestl1o = c_sblas1.o - -stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o - -stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o - -dtestl1o = c_dblas1.o - -dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o - -dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o - -ctestl1o = c_cblas1.o - -ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o - -ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o - -ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o - -ztestl1o = c_zblas1.o - -ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o - -ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o - -ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o +TOPDIR = .. +include ../Makefile.system ifeq ($(NOFORTRAN),1) all :: else -all :: all1 all2 all3 +all :: level1 level2 level3 endif ifeq ($(BUILD_SINGLE),1) -all1targets += xscblat1 +S1=sblat1 endif ifeq ($(BUILD_DOUBLE),1) -all1targets += xdcblat1 +D1=dblat1 endif ifeq ($(BUILD_COMPLEX),1) -all1targets += xccblat1 +C1=cblat1 endif ifeq ($(BUILD_COMPLEX16),1) -all1targets += xzcblat1 +Z1=zblat1 endif -all1: $(all1targets) +level1: $(S1) $(D1) $(C1) $(Z1) ifneq ($(CROSS), 1) +ifeq ($(BUILD_SINGLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 +endif +ifdef SMP ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xscblat1 + OMP_NUM_THREADS=2 ./sblat1 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./xdcblat1 -endif + 
OMP_NUM_THREADS=2 ./dblat1 +endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./xccblat1 + OMP_NUM_THREADS=2 ./cblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./xzcblat1 + OMP_NUM_THREADS=2 ./zblat1 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./xscblat1 + OPENBLAS_NUM_THREADS=2 ./sblat1 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./xdcblat1 + OPENBLAS_NUM_THREADS=2 ./dblat1 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat1 + OPENBLAS_NUM_THREADS=2 ./cblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat1 + OPENBLAS_NUM_THREADS=2 ./zblat1 +endif endif endif endif ifeq ($(BUILD_SINGLE),1) -all2targets += xscblat2 +S2=sblat2 endif ifeq ($(BUILD_DOUBLE),1) -all2targets += xdcblat2 +D2=dblat2 endif ifeq ($(BUILD_COMPLEX),1) -all2targets += xccblat2 +C2=cblat2 endif ifeq ($(BUILD_COMPLEX16),1) -all2targets += xzcblat2 +Z2=zblat2 endif -all2: $(all2targets) +level2: $(S2) $(D2) $(C2) $(Z2) + ifneq ($(CROSS), 1) + rm -f ?BLAT2.SUMM +ifeq ($(BUILD_SINGLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif +ifdef SMP + rm -f ?BLAT2.SUMM ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xscblat2 < sin2 + OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./xdcblat2 < din2 + OMP_NUM_THREADS=2 ./dblat2 < 
./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./xccblat2 < cin2 + OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./xzcblat2 < zin2 + OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 + OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 + OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 + OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 + OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif endif endif endif - +ifeq ($(BUILD_BFLOAT16),1) +B3= test_sbgemm +endif ifeq ($(BUILD_SINGLE),1) -all3targets += xscblat3 +S3=sblat3 endif ifeq ($(BUILD_DOUBLE),1) -all3targets += xdcblat3 +D3=dblat3 endif ifeq ($(BUILD_COMPLEX),1) -all3targets += xccblat3 +C3=cblat3 endif ifeq ($(BUILD_COMPLEX16),1) -all3targets += xzcblat3 +Z3=zblat3 endif -all3: $(all3targets) +level3: $(B3) $(S3) $(D3) $(C3) $(Z3) + ifneq ($(CROSS), 1) -ifeq ($(USE_OPENMP), 1) + rm -f ?BLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 +endif ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xscblat3 < sin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat 
SBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./xdcblat3 < din3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./xccblat3 < cin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./xzcblat3 < zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif +ifdef SMP + rm -f ?BLAT3.SUMM +ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif -else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 + OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 + OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 + OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 -endif + OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif +else +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif - -all3_3m: xzcblat3_3m xccblat3_3m -ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m + OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 endif -ifeq ($(BUILD_COMPLEX16),1) - 
OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +ifeq ($(BUILD_DOUBLE),1) + OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 endif -else ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m + OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m + OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif endif endif +endif + +level3_3m : zblat3_3m cblat3_3m +ifneq ($(CROSS), 1) + rm -f ?BLAT3_3M.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT3_3M.SUMM +ifeq ($(USE_OPENMP), 1) + OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 + OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +else + OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +endif +endif +endif -clean :: - rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) + +ifeq ($(CORE), C910V) +EXTRALIB = +CEXTRALIB = +endif + ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) @@ -215,54 +273,90 @@ endif endif ifeq ($(BUILD_SINGLE),1) -# Single real -xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o 
sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_DOUBLE),1) -# Double real -xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +else +dblat2: +dblat3: endif +qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + ifeq ($(BUILD_COMPLEX),1) -# Single complex -xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) 
- $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) -endif +cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif ifeq ($(BUILD_COMPLEX16),1) -# Double complex -xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif -xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_BFLOAT16),1) +test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif -include $(TOPDIR)/Makefile.tail +ifeq ($(BUILD_COMPLEX),1) +cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3_3m 
cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX16),1) +zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + + + +clean: + @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ + sblat1 dblat1 cblat1 zblat1 \ + sblat2 dblat2 cblat2 zblat2 \ + test_sbgemm sblat3 dblat3 cblat3 zblat3 \ + sblat1p dblat1p cblat1p zblat1p \ + sblat2p dblat2p cblat2p zblat2p \ + sblat3p dblat3p cblat3p zblat3p \ + zblat3_3m zblat3_3mp \ + cblat3_3m cblat3_3mp \ + *.stackdump *.dll + +libs: + +prof: + +quick : + $(MAKE) -C $(TOPDIR) libs + +# include ../Makefile.tail From 02562949218dded905b100cff21eae15364598ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 17:41:44 +0100 Subject: [PATCH 072/121] Fix syntax mixup --- common_power.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_power.h b/common_power.h index 6fb2af30a..a49197fd7 100644 --- a/common_power.h +++ b/common_power.h @@ -849,8 +849,8 @@ Lmcount$lazy_ptr: #else #define BUFFER_SIZE ( 16 << 20) #endif -#ifeq ($(DYNAMIC_ARCH), 1) -#undefine BUFFER_SIZE +#ifdef DYNAMIC_ARCH +#undef BUFFER_SIZE #define BUFFER_SIZE (64 << 22) #endif From e7bf8ced6ccdc9c579ff5f8b94c20f104d98f616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 20:20:28 +0100 Subject: [PATCH 073/121] Build fix for systems that do not support getauxval --- driver/others/dynamic_arm64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 007a221db..4f1b12f27 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -143,7 +143,7 @@ static gotoblas_t *get_coretype(void) { #if (!defined OS_LINUX && !defined OS_ANDROID) return NULL; -#endif +#else if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { #ifdef __linux @@ -235,6 +235,7 @@ static 
gotoblas_t *get_coretype(void) { openblas_warning(1, coremsg); } return NULL; +#endif } void gotoblas_dynamic_init(void) { From 01f01dae98abd447f3c962ba5c08498831e58f00 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 21:15:08 +0100 Subject: [PATCH 074/121] Add -msse if supported --- Makefile.x86 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.x86 b/Makefile.x86 index 330690935..f310f4973 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,5 +1,10 @@ # COMPILER_PREFIX = mingw32- +ifdef HAVE_SSE +CCOMMON_OPT += -msse +FCOMMON_OPT += -msse +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 From 11ebe5fa255eae6544f1087a2b673042894afd02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 21:16:07 +0100 Subject: [PATCH 075/121] Avoid redefinition warning --- getarch.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/getarch.c b/getarch.c index 8b00aaee7..cf0be8d23 100644 --- a/getarch.c +++ b/getarch.c @@ -97,9 +97,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__x86_64__) || defined(_M_X64) #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else +#ifndef NO_AVX512 #define NO_AVX512 #endif #endif +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -981,20 +983,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif -#ifdef FORCE_RISCV64_GENERIC -#define FORCE -#define ARCHITECTURE "RISCV64" -#define SUBARCHITECTURE "RISCV64_GENERIC" -#define SUBDIRNAME "riscv64" -#define ARCHCONFIG "-DRISCV64_GENERIC " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "riscv64_generic" -#define CORENAME "RISCV64_GENERIC" -#else -#endif - #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1280,21 +1268,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif -#ifdef FORCE_C910V -#define FORCE -#define ARCHITECTURE "RISCV64" -#define SUBARCHITECTURE "C910V" -#define SUBDIRNAME "riscv64" -#define ARCHCONFIG "-DC910V " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "c910v" -#define CORENAME "C910V" -#else -#endif - - #ifndef FORCE #ifdef USER_TARGET @@ -1349,10 +1322,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif -#ifdef __riscv -#include "cpuid_riscv64.c" -#endif - #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 358100ec15b3fff0b4ac560489c970385fb6f87b Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Sun, 29 Nov 2020 13:57:57 +0900 Subject: [PATCH 076/121] add Fujitsu compilers Co-authored-by: Tomoki Karatsu --- f_check | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/f_check b/f_check index c12b0f2ef..9ef7b8086 100644 --- a/f_check +++ b/f_check @@ -69,7 +69,12 @@ if ($compiler eq "") { $bu = "_"; } - if ($data =~ /GNU/ || $data =~ /GCC/ ) { + if ($data =~ /Fujitsu/) { + + $vendor = FUJITSU; + $openmp = "-Kopenmp"; + + } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; @@ -337,8 +342,8 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) - && ($flags !~ /[0-9]+/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/)) + && ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/)) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . 
" "; From 3b4c016110a7de5e52a76045aaa4be25965c8e6c Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 29 Nov 2020 17:17:07 +0800 Subject: [PATCH 077/121] link math lib on FreeBSD --- utest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 357e61301..0c99e0d12 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -58,7 +58,7 @@ add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) -if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") target_link_libraries(${OpenBLAS_utest_bin} m) endif() From ca17d3dc3d51589c8048f23355b2ac1cdf32771c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 13:19:51 +0100 Subject: [PATCH 078/121] Restore RISCV entries accidentally trashed by my PR 3005 --- getarch.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/getarch.c b/getarch.c index cf0be8d23..f107da3e9 100644 --- a/getarch.c +++ b/getarch.c @@ -983,6 +983,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1268,6 +1282,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1322,6 +1351,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 2e99e2699b6d381a7d5709ad2e0dbcd0269826ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 15:32:17 +0100 Subject: [PATCH 079/121] Add workaround for gcc 4.6 miscompiling assembly kernels with -mavx --- Makefile.system | 1 + Makefile.x86_64 | 4 ++++ c_check | 12 +++++++++++ getarch.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/Makefile.system b/Makefile.system index afc8ee207..b5974f872 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1561,6 +1561,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export NO_AVX2 export BUILD_BFLOAT16 export SBGEMM_UNROLL_M diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 43bfc9ecd..d806a4ed2 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -20,14 +20,18 @@ ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif +ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx FCOMMON_OPT += -mavx endif +endif +ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif +endif ifdef HAVE_FMA3 CCOMMON_OPT += -mfma FCOMMON_OPT += -mfma diff --git a/c_check b/c_check index 405963ae6..efea9b0fb 100644 --- a/c_check +++ b/c_check @@ -229,6 +229,16 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); 
$binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); + +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { +$no_avx2 = 0; +$oldgcc = 0; +$data = `$compiler_name -dumpversion`; +if ($data <= 4.6) { +$no_avx2 = 1; +$oldgcc = 1; +} +} $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { eval "use File::Temp qw(tempfile)"; @@ -368,6 +378,8 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; +print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; +print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; diff --git a/getarch.c b/getarch.c index cf0be8d23..9344defb5 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif +#endif #ifdef FORCE_SKYLAKEX #ifdef NO_AVX512 @@ -551,6 +562,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ @@ -565,6 +586,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "zen" #define CORENAME "ZEN" #endif +#endif #ifdef FORCE_SSE_GENERIC @@ -983,6 +1005,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1268,6 +1304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1322,6 +1373,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 62a2eb884f0d364716a94d12284e339d20ffcc29 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 15:33:07 +0100 Subject: [PATCH 080/121] Add SSE flags for x86 --- Makefile.x86 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile.x86 b/Makefile.x86 index f310f4973..0e27264d8 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -59,9 +59,11 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif - +ifdef HAVE_SSE2 +CCOMMON_OPT += -msse2 +FCOMMON_OPT += -msse2 +endif ifdef HAVE_SSE3 -ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 ifdef HAVE_SSSE3 @@ -73,5 +75,4 @@ CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif endif -endif From 7d46e31de1a206ea55ae31e7a0a1ae4b704458e0 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 29 Nov 2020 15:28:28 -0600 Subject: [PATCH 081/121] POWER10: Optimize dgemv_n Handling as 4x8 with vector pairs gives better performance than existing code in POWER10. 
--- kernel/power/dgemv_n_microk_power10.c | 150 +++++++++++++++++++-- kernel/power/dgemv_n_power10.c | 185 ++------------------------ 2 files changed, 155 insertions(+), 180 deletions(-) diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index 4be8a5f9b..e47de2cb5 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -/************************************************************************************** -* 2016/03/30 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) @@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } +static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + + double *a0; + double *a1; + double *a2; + double *a3; + double *a4; + double *a5; + double *a6; + double *a7; + long tmp; + __asm__ + ( + "lxvp 34, 0( %15) \n\t" // x0, x1 + "lxvp 38, 32( %15) \n\t" // x4, x5 + + XXSPLTD_S(58,%x14,0) // alpha, alpha + "sldi %10, %17, 3 \n\t" // lda * sizeof (double) + "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha + "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha + "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha + + "li %11, 32 \n\t" + + "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda + "add %10, %10, %10 \n\t" // 2 * lda + XXSPLTD_S(32,34,1) // x0 * alpha, x0 
* alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha + + "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda + "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda + "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda + "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda + "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda + + "lxvp 40, 0( %3) \n\t" // a0[0], a0[1] + "lxvp 42, 0( %4) \n\t" // a1[0], a1[1] + "lxvp 44, 0( %5) \n\t" // a2[0], a2[1] + "lxvp 46, 0( %6) \n\t" // a3[0], a3[1] + "lxvp 50, 0( %7) \n\t" // a4[0] + "lxvp 52, 0( %8) \n\t" // a5[0] + "lxvp 54, 0( %9) \n\t" // a6[0] + "lxvp 56, 0( %10) \n\t" // a7[0] + + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "lxvpx 50, %7, %11 \n\t" // a4[0] + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "lxvpx 52, %8, %11 \n\t" // a5[0] + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "lxvpx 54, %9, %11 \n\t" // a6[0] + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "lxvpx 56, %10, %11 \n\t" // a7[0] + "addi %11, %11, 32 \n\t" + + "stxvp 36, 0( %2) \n\t" // y0, y1 + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "stxvp 36, 0( %2) \n\t" // y0, y1 + + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3), // 6 + "=&b" (a4), // 7 + "=&b" (a5), // 8 + "=&b" (a6), // 9 + "=&b" (a7), // 10 + "=b" (tmp) + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 14 + "r" (x), // 15 + "3" (ap), // 16 + "4" (lda) // 17 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48", + "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c index ad5f1ba0d..aba15ab4e 100644 --- a/kernel/power/dgemv_n_power10.c +++ b/kernel/power/dgemv_n_power10.c @@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" -#include - -typedef __vector unsigned char vec_t; -typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef __vector_pair __attribute__((aligned(8))) vecp_t; #include "dgemv_n_microk_power10.c" -#define MMA(X, APTR, ACC) \ - rX = (vec_t *) & X; \ - rowA = *((vecp_t*)((void*)&APTR)); \ - __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); - -#define SAVE(ACC, Z) \ - rowC = (v4sf_t *) &y[Z]; \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - result[0][1] = result[1][0]; \ - result[2][1] = result[3][0]; \ - rowC[0] += valpha * result[0]; \ - rowC[1] += valpha * result[2]; - -void -dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, - FLOAT * y, FLOAT alpha) -{ - BLASLONG i, j, tmp; - FLOAT *a0 = a_ptr; - FLOAT *x1 = xo; - vector double valpha = { alpha, alpha }; - v4sf_t *rowC; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[4]; - vecp_t rowA; - vec_t *rX; - tmp = (n / 32) * 32; - for (i = 0; i < tmp; i += 32) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); - __builtin_mma_xxsetaccz (&acc2); - __builtin_mma_xxsetaccz (&acc3); - __builtin_mma_xxsetaccz (&acc4); - __builtin_mma_xxsetaccz (&acc5); - __builtin_mma_xxsetaccz (&acc6); - __builtin_mma_xxsetaccz (&acc7); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * 
lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i + 0); - SAVE (&acc1, i + 4); - SAVE (&acc2, i + 8); - SAVE (&acc3, i + 12); - SAVE (&acc4, i + 16); - SAVE (&acc5, i + 20); - SAVE (&acc6, i + 24); - SAVE (&acc7, i + 28); - - } - for (i = tmp; i < n; i += 4) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - 
__builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i); - } -} - - #define NBMAX 4096 #ifndef HAVE_KERNEL_4x4 @@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; - BLASLONG n1; BLASLONG m1; BLASLONG m2; BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda128 = lda << 7; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT *ybuffer; @@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); ybuffer = buffer; - BLASLONG n128 = n >> 7; - n1 = (n - (n128 * 128)) >> 2; - n2 = (n - (n128 * 128)) & 3; + BLASLONG n8 = n >> 3; + n2 = n & 3; m3 = m & 3 ; m1 = m & -4 ; @@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); - a_ptr += lda128; - x_ptr += 128; + dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda8; + x_ptr += 8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); a_ptr += lda4; @@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - FLOAT xbuffer[128] __attribute__ ((aligned (16))); BLASLONG j; - for ( j = 0; j < 128 ; j++) + for ( j = 0; j < 8 ; j++) { xbuffer[j] = x_ptr[0]; x_ptr += inc_x; } - dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); - a_ptr += lda128; + dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda8; } - for( i = 0; i < n1 ; i++) + 
if( n & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; From f6620229942eb7b670d13a527e2b22bc5ac05441 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Nov 2020 17:24:27 +0100 Subject: [PATCH 082/121] Move the version check to avoid overwriting unprocessed compiler data --- c_check | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/c_check b/c_check index efea9b0fb..a841df153 100644 --- a/c_check +++ b/c_check @@ -229,16 +229,6 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); - -if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { -$no_avx2 = 0; -$oldgcc = 0; -$data = `$compiler_name -dumpversion`; -if ($data <= 4.6) { -$no_avx2 = 1; -$oldgcc = 1; -} -} $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { eval "use File::Temp qw(tempfile)"; @@ -286,6 +276,15 @@ if ($data =~ /HAVE_C11/) { } } +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { + $no_avx2 = 0; + $oldgcc = 0; + $data = `$compiler_name -dumpversion`; + if ($data <= 4.6) { + $no_avx2 = 1; + $oldgcc = 1; + } +} $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; From 22574b474eec3220b4fe78257f66898281502bd5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Nov 2020 21:41:51 +0100 Subject: [PATCH 083/121] Suppress -mfma as well for gcc 4.6 --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index d806a4ed2..00967bcb6 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -32,10 +32,12 @@ CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif endif +ifndef OLDGCC ifdef HAVE_FMA3 CCOMMON_OPT += -mfma FCOMMON_OPT += -mfma endif +endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH From b766c1e9bb592396b0c71ba47bf48e83534ca52c Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Tue, 1 Dec 2020 16:49:26 +0800 
Subject: [PATCH 084/121] Improve the performance of zasum and casum with AVX512 intrinsic --- kernel/x86_64/KERNEL.SKYLAKEX | 3 + kernel/x86_64/casum.c | 144 ++++++++++ kernel/x86_64/casum_microk_skylakex-2.c | 349 ++++++++++++++++++++++++ kernel/x86_64/zasum.c | 144 ++++++++++ kernel/x86_64/zasum_microk_skylakex-2.c | 340 +++++++++++++++++++++++ 5 files changed, 980 insertions(+) create mode 100644 kernel/x86_64/casum.c create mode 100644 kernel/x86_64/casum_microk_skylakex-2.c create mode 100644 kernel/x86_64/zasum.c create mode 100644 kernel/x86_64/zasum_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b8b84c30..3d71584fe 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c + +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c new file mode 100644 index 000000000..dce30e9b0 --- /dev/null +++ b/kernel/x86_64/casum.c @@ -0,0 +1,144 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "casum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_CASUM_KERNEL +static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + } + + while (i < n) { + sum4 += (ABS_K(x1[0]) + ABS_K(x1[1])); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = casum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); 
+ if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c new file mode 100644 index 000000000..d51929f9f --- /dev/null +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -0,0 +1,349 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_CASUM_KERNEL 1 + +#include + +#include + +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + if (n2 < 64) { + __m128 accum_10, accum_11, accum_12, accum_13; + __m128 abs_mask1; + + accum_10 = _mm_setzero_ps(); + accum_11 = _mm_setzero_ps(); + accum_12 = _mm_setzero_ps(); + accum_13 = _mm_setzero_ps(); + + abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); + abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + + if (n2 >= 32){ + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + _mm_prefetch(&x1[16], _MM_HINT_T0); + __m128 x04 = _mm_loadu_ps(&x1[16]); + __m128 x05 = _mm_loadu_ps(&x1[20]); + __m128 x06 = _mm_loadu_ps(&x1[24]); + 
__m128 x07 = _mm_loadu_ps(&x1[28]); + + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + x02 = _mm_and_ps(x02, abs_mask1); + x03 = _mm_and_ps(x03, abs_mask1); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + x04 = _mm_and_ps(x04, abs_mask1); + x05 = _mm_and_ps(x05, abs_mask1); + x06 = _mm_and_ps(x06, abs_mask1); + x07 = _mm_and_ps(x07, abs_mask1); + + accum_10 = _mm_add_ps(accum_10, x04); + accum_11 = _mm_add_ps(accum_11, x05); + accum_12 = _mm_add_ps(accum_12, x06); + accum_13 = _mm_add_ps(accum_13, x07); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + x02 = _mm_and_ps(x02, abs_mask1); + x03 = _mm_and_ps(x03, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + x00 = _mm_and_ps(x00, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (ABS_K(x1[0]) + ABS_K(x1[1])); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; 
+ __m512 x00, x01, x02, x03, x04, x05, x06, x07; + __m512 abs_mask = (__m512)_mm512_set1_epi32(0x7fffffff); + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = 
_mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = 
_mm512_and_ps(x07, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> 
(16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c new file mode 100644 index 000000000..514ce2434 --- /dev/null +++ b/kernel/x86_64/zasum.c @@ -0,0 +1,144 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "zasum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_ZASUM_KERNEL +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x1[0]); + temp1 = ABS_K(x1[1]); + temp2 = ABS_K(x1[2]); + temp3 = ABS_K(x1[3]); + temp4 = ABS_K(x1[4]); + temp5 = ABS_K(x1[5]); + temp6 = ABS_K(x1[6]); + temp7 = ABS_K(x1[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += ABS_K(x1[0]) + ABS_K(x1[1]); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = zasum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, + BLASLONG dummy0, 
BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c new file mode 100644 index 000000000..b44c53801 --- /dev/null +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -0,0 +1,340 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_ZASUM_KERNEL 1 + +#include + +#include + +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + + if (n2 < 32) { + __m128d accum_10, accum_11, accum_12, accum_13; + __m128d abs_mask1; + + accum_10 = 
_mm_setzero_pd(); + accum_11 = _mm_setzero_pd(); + accum_12 = _mm_setzero_pd(); + accum_13 = _mm_setzero_pd(); + + // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); + abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); + abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + if (n2 >= 16){ + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + _mm_prefetch(&x1[8], _MM_HINT_T0); + __m128d x04 = _mm_loadu_pd(&x1[ 8]); + __m128d x05 = _mm_loadu_pd(&x1[10]); + __m128d x06 = _mm_loadu_pd(&x1[12]); + __m128d x07 = _mm_loadu_pd(&x1[14]); + + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + x02 = _mm_and_pd(x02, abs_mask1); + x03 = _mm_and_pd(x03, abs_mask1); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + x04 = _mm_and_pd(x04, abs_mask1); + x05 = _mm_and_pd(x05, abs_mask1); + x06 = _mm_and_pd(x06, abs_mask1); + x07 = _mm_and_pd(x07, abs_mask1); + + accum_10 = _mm_add_pd(accum_10, x04); + accum_11 = _mm_add_pd(accum_11, x05); + accum_12 = _mm_add_pd(accum_12, x06); + accum_13 = _mm_add_pd(accum_13, x07); + + x1 += 16; + n2 -= 16; + } + + if (n2 >= 8) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + x02 = _mm_and_pd(x02, abs_mask1); + x03 = _mm_and_pd(x03, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + x00 = 
_mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + x00 = _mm_and_pd(x00, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + } + + accum_10 = _mm_add_pd(accum_10, accum_11); + accum_12 = _mm_add_pd(accum_12, accum_13); + accum_10 = _mm_add_pd(accum_10, accum_12); + + accum_10 = _mm_hadd_pd(accum_10, accum_10); + + sumf = accum_10[0]; + } + else { + __m512d accum_0, accum_1, accum_2, accum_3; + __m512d x00, x01, x02, x03, x04, x05, x06, x07; + __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff); + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 128) { + if (n2 >= 64) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x04 = _mm512_loadu_pd(&x1[32]); + x05 = _mm512_loadu_pd(&x1[40]); + x06 = _mm512_loadu_pd(&x1[48]); + x07 = _mm512_loadu_pd(&x1[56]); + + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = 
_mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + } + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + // n2 >= 128, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (0 != align_header) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x04 = _mm512_load_pd(&x1[32]); + x05 = _mm512_load_pd(&x1[40]); + x06 = _mm512_load_pd(&x1[48]); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + + while (n2 >= 64) { + x00 = 
_mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + x00 = _mm512_load_pd(&x1[ 0]); + accum_1 = _mm512_add_pd(accum_1, x01); + x01 = _mm512_load_pd(&x1[ 8]); + accum_2 = _mm512_add_pd(accum_2, x02); + x02 = _mm512_load_pd(&x1[16]); + accum_3 = _mm512_add_pd(accum_3, x03); + x03 = _mm512_load_pd(&x1[24]); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x04); + x04 = _mm512_load_pd(&x1[32]); + accum_1 = _mm512_add_pd(accum_1, x05); + x05 = _mm512_load_pd(&x1[40]); + accum_2 = _mm512_add_pd(accum_2, x06); + x06 = _mm512_load_pd(&x1[48]); + accum_3 = _mm512_add_pd(accum_3, x07); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + } + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + if (n2 >= 32) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = 
_mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_load_pd(&x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + } + + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + } + + return sumf; +} +#endif From 9621062ebabcfb8f75a318fbcaf9558b26de9799 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Dec 2020 12:23:30 +0100 Subject: [PATCH 085/121] Update OSX xcode version to 11.5 --- .travis.yml | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3f917ce72..909d1eddb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -211,7 +211,7 @@ matrix: - &test-macos os: osx - osx_image: xcode10.1 + osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update @@ -238,17 +238,23 @@ matrix: - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 
-miphoneos-version-min=10.0" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 From 77a538d4ba34b2736014346285006b43ece2d0a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Dec 2020 22:05:35 +0100 Subject: [PATCH 086/121] Update an overlooked instance of xcode 
10.0 as well --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 909d1eddb..7fe2ab388 100644 --- a/.travis.yml +++ b/.travis.yml @@ -233,7 +233,7 @@ matrix: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - <<: *test-macos - osx_image: xcode10.0 + osx_image: xcode11.5 env: - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" From 0cb7a403b25ebd623f9de97123742c0274fb7147 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Wed, 2 Dec 2020 09:51:52 +0800 Subject: [PATCH 087/121] fix error declare function blas_level1_thread_with_return_value --- kernel/x86_64/casum.c | 2 +- kernel/x86_64/zasum.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index dce30e9b0..a1bd76f33 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -93,7 +93,7 @@ static int asum_thread_function(BLASLONG n, return 0; } -extern int blas_level1_thread_with_value(int mode, +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 514ce2434..6e758e2e3 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -93,7 +93,7 @@ static int asum_thread_function(BLASLONG n, return 0; } -extern int blas_level1_thread_with_value(int mode, +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, From c361313564b9909aea1587435d56a0f5ffe8fcf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 07:49:43 +0100 Subject: [PATCH 088/121] Disable deprecated 32bit xcode --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7fe2ab388..d532899fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,8 +214,6 @@ matrix: osx_image: 
xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: @@ -232,10 +230,10 @@ matrix: env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode11.5 - env: - - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" + # - <<: *test-macos + # osx_image: xcode10 + # env: + # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos osx_image: xcode11.5 From 57456c248b6b240d396cc628b4e361836afb1a10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 15:56:21 +0100 Subject: [PATCH 089/121] fix gfortran requirement in osx interface64 test --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d532899fe..83237662f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -216,8 +216,10 @@ matrix: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - brew update + - brew install gcc-10 env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - <<: *test-macos osx_image: xcode12 From dcbb3b5ef1e2aecad926526d21cf080d659eb6fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 23:13:13 +0100 Subject: [PATCH 090/121] fix misplaced lines --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 83237662f..771e70d42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,10 +214,10 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - brew update - - brew install gcc-10 env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" From 
72a553f5bc032a2c9fdb08729e6a5e8a0b722d07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 09:17:27 +0100 Subject: [PATCH 091/121] Update .travis.yml --- .travis.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 771e70d42..6c5fb2f96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,23 +214,19 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - <<: *test-macos osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" # - <<: *test-macos # osx_image: xcode10 From a6692dc129acdd317f011c6dab1ea0a7e5080931 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 14:32:21 +0100 Subject: [PATCH 092/121] use gfortran-10 with xcode 12 --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6c5fb2f96..bde0e202d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -223,10 +223,12 @@ matrix: osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" # - <<: *test-macos # osx_image: xcode10 From da0c94c76f1494b50274e9e41227a3f15e4765ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 21:25:57 +0100 Subject: 
[PATCH 093/121] Avoid linking both GNU libgomp and LLVM libomp in clang/gfortran builds --- f_check | 3 +++ 1 file changed, 3 insertions(+) diff --git a/f_check b/f_check index 9ef7b8086..cb869b3bb 100644 --- a/f_check +++ b/f_check @@ -330,6 +330,9 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } + if ($flags =~ /-lgomp/ && $CC == /clang/) { + $flags = "-lomp"; + } if ( ($flags =~ /^\-l/) From 74b585058145ee362ab57fbcbbc5c0d19332b432 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 21:28:10 +0100 Subject: [PATCH 094/121] Add libomp to the LAPACK(-test) dependencies in clang/gfortran builds --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index a9af62a22..54dd3be41 100644 --- a/Makefile +++ b/Makefile @@ -268,7 +268,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc From 41fe6e864ed70860cda1b1ccef09b55caf41fec9 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 3 Dec 2020 14:40:11 -0600 Subject: [PATCH 095/121] POWER10: Update param.h Increasing the values of DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q helps in improving performance ~10% for DGEMM. 
--- param.h | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index 7789c83c7..ee5ad17fb 100644 --- a/param.h +++ b/param.h @@ -2388,7 +2388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER9) || defined(POWER10) +#if defined(POWER9) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2426,6 +2426,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 832 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 1026 +#define DGEMM_DEFAULT_Q 960 +#define CGEMM_DEFAULT_Q 1026 +#define ZGEMM_DEFAULT_Q 1026 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 8 + #undef SBGEMM_DEFAULT_UNROLL_N #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_P @@ -2436,10 +2469,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SBGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_R 4096 -#undef DGEMM_DEFAULT_UNROLL_M -#undef DGEMM_DEFAULT_UNROLL_N -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 8 #endif #if defined(SPARC) && defined(V7) From a1eecccda28cf7d00a5ffbbcd5afb4ca6ef6c6a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 23:43:17 +0100 Subject: [PATCH 096/121] Update f_check --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index cb869b3bb..42241ae10 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC == /clang/) { + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { $flags = "-lomp"; } From 213c0e7abb6ab909479e8e956b159c040a1782f8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 4 Dec 2020 17:07:06 -0600 Subject: [PATCH 097/121] Added special unrolled vectorized versions of "Solve" for specific sizes, in DTRSM and STRSM, to improve performance in Power9 and Power10. 
--- kernel/power/KERNEL.POWER10 | 18 +- kernel/power/KERNEL.POWER9 | 14 +- kernel/power/trsm_kernel_LN_power10.c | 1280 +++++++++++++++++++++++++ kernel/power/trsm_kernel_LT_power10.c | 1265 ++++++++++++++++++++++++ kernel/power/trsm_kernel_RN_power10.c | 828 ++++++++++++++++ kernel/power/trsm_kernel_RT_power10.c | 855 +++++++++++++++++ 6 files changed, 4244 insertions(+), 16 deletions(-) create mode 100644 kernel/power/trsm_kernel_LN_power10.c create mode 100644 kernel/power/trsm_kernel_LT_power10.c create mode 100644 kernel/power/trsm_kernel_RN_power10.c create mode 100644 kernel/power/trsm_kernel_RT_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c25cd9f04..d61f5194a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c + +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..2bd2516de 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..5ca1603a6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] 
*= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + VbS7 = vec_splat(Vb[31], 1); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + c0[6] -= c0[7] * a[62]; + c1[6] -= c1[7] * a[62]; + c2[6] -= c2[7] * a[62]; + c3[6] -= c3[7] * a[62]; + c4[6] -= c4[7] * a[62]; + c5[6] -= c5[7] * a[62]; + c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = 
vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, 
Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); 
+ VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] 
*= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = 
vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = 
vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], 
Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, 
Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + 
c2[ 8] -= b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + 
c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = 
vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] 
-= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + 
Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] * a[84]; + c7[4] -= b[47] * a[84]; + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + c0[0] -= b[24] * a[48]; + c0[1] -= b[24] * a[49]; + c0[2] -= b[24] * a[50]; + c1[0] -= b[25] * a[48]; + c1[1] -= b[25] * a[49]; + c1[2] -= b[25] * a[50]; + c2[0] -= b[26] * a[48]; + c2[1] -= b[26] * a[49]; + c2[2] -= b[26] * a[50]; + c3[0] -= b[27] * a[48]; + c3[1] -= b[27] * a[49]; + c3[2] -= b[27] * a[50]; + c4[0] -= b[28] * a[48]; + c4[1] -= b[28] * a[49]; + c4[2] -= b[28] * a[50]; + c5[0] -= b[29] * a[48]; + c5[1] -= b[29] * a[49]; 
/*
 * NOTE(review): this region is a line-wrapped unified diff; the stray "+"
 * tokens are diff line markers, not C operators. It holds the tail of the
 * vectorized solve16x8 (single-precision LN variant, started above this
 * chunk), the two scalar fallback solvers, and the CNAME driver for the
 * TRSM LN kernel (so identified by the disabled fprintf below).
 */
+ c5[2] -= b[29] * a[50]; + c6[0] -= b[30] * a[48]; + c6[1] -= b[30] * a[49]; + c6[2] -= b[30] * a[50]; + c7[0] -= b[31] * a[48]; + c7[1] -= b[31] * a[49]; + c7[2] -= b[31] * a[50]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + c0[0] -= b[16] * a[32]; + c0[1] -= b[16] * a[33]; + c1[0] -= b[17] * a[32]; + c1[1] -= b[17] * a[33]; + c2[0] -= b[18] * a[32]; + c2[1] -= b[18] * a[33]; + c3[0] -= b[19] * a[32]; + c3[1] -= b[19] * a[33]; + c4[0] -= b[20] * a[32]; + c4[1] -= b[20] * a[33]; + c5[0] -= b[21] * a[32]; + c5[1] -= b[21] * a[33]; + c6[0] -= b[22] * a[32]; + c6[1] -= b[22] * a[33]; + c7[0] -= b[23] * a[32]; + c7[1] -= b[23] * a[33]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + c0[0] -= b[ 8] * a[16]; + c1[0] -= b[ 9] * a[16]; + c2[0] -= b[10] * a[16]; + c3[0] -= b[11] * a[16]; + c4[0] -= b[12] * a[16]; + c5[0] -= b[13] * a[16]; + c6[0] -= b[14] * a[16]; + c7[0] -= b[15] * a[16]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#endif +
/*
 * solve(): scalar real-valued fallback, backward substitution (LN).
 * Walks rows i = m-1 .. 0; for each of the n RHS columns it scales the
 * pivot row of C by the packed diagonal entry (the solve multiplies, so
 * the packed diagonal appears to be stored pre-inverted — standard
 * OpenBLAS trsm packing; TODO confirm), stores the result into the packed
 * B panel, and eliminates it from rows k < i. "b -= 2 * n" rewinds B by
 * the n entries just written plus n to step to the previous row's slot.
 */
+static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else +
/*
 * Complex variant of the scalar fallback: same backward-substitution
 * structure with interleaved (re,im) pairs, hence the *2 strides and the
 * doubled ldc. CONJ selects the conjugated multiply in both the pivot
 * scaling and the elimination updates.
 */
+static inline 
__attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + +
/*
 * CNAME: TRSM LN driver. Tiles n by GEMM_UNROLL_N and m by GEMM_UNROLL_M
 * (remainders handled by the power-of-two sweeps), calls GEMM_KERNEL with
 * dm1 = -1 to subtract the already-solved trailing panel, then runs the
 * triangular solve on each diagonal tile. When the A panel is 8-byte
 * aligned and the unroll factors match (well_aligned), the hand-vectorized
 * solve8x8/solve16x8 kernels are used instead of the scalar solve().
 */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + 
if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + 
} + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c new file mode 100644 index 000000000..14ff12fe4 --- /dev/null +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -0,0 +1,1265 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include
/* NOTE(review): the bare "#include" above lost its target during patch
   transport (an angle-bracketed header was stripped by the extraction).
   Given the vec_splat/vec_nmsub intrinsics used below, the missing header
   is presumably <altivec.h> — confirm against the original upstream patch. */
 + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE +
/*
 * solve8x8 (trsm_kernel_LT_power10.c, double precision): fully unrolled
 * 8x8 triangular solve against 8 RHS columns (c0..c7), forward
 * substitution over rows 0..7 using the packed diagonal entries
 * a[0], a[9], a[18], ..., a[63]. The solve multiplies by the diagonal,
 * so the packed diagonal appears to be stored pre-inverted (standard
 * OpenBLAS trsm packing — TODO confirm). Each solved row is written to
 * the packed B panel and eliminated from the remaining rows via VSX
 * vec_splat/vec_nmsub rank-1 updates (vector FLOAT = 2 doubles), with
 * scalar statements covering the elements that share a vector with the
 * pivot row.
 */
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; +
/* Row 0: scale by (inverted) diagonal a[0], store to B, then eliminate
   from rows 1..7 of all eight columns. Same pattern repeats per row. */
 + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[1], 0); + VbS3 = vec_splat(Vb[1], 1); + VbS4 = vec_splat(Vb[2], 0); + VbS5 = vec_splat(Vb[2], 1); + VbS6 = vec_splat(Vb[3], 0); + VbS7 = vec_splat(Vb[3], 1); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = 
vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= c0[0] * a[1]; + c1[1] -= c1[0] * a[1]; + c2[1] -= c2[0] * a[1]; + c3[1] -= c3[0] * a[1]; + c4[1] -= c4[0] * a[1]; + c5[1] -= c5[0] * a[1]; + c6[1] -= c6[0] * a[1]; + c7[1] -= c7[0] * a[1]; + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= 
a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = 
vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = 
vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); 
+ b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = 
(c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = 
(c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = 
vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = 
vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, 
Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + 
b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= 
b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] 
*= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = 
(c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= 
a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = 
%3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, 
ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); + VbS0 = vec_splat(Vb[0], 1); + VbS1 = vec_splat(Vb[1], 0); + VbS2 = vec_splat(Vb[1], 1); + VbS3 = vec_splat(Vb[2], 0); + VbS4 = vec_splat(Vb[2], 1); + VbS5 = vec_splat(Vb[3], 0); + VbS6 = vec_splat(Vb[3], 1); + Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]); + Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]); + Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]); + Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]); + Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]); + Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]); + Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]); + Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]); + Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]); + Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]); + Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]); + Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]); + Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]); + Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]); + Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]); + Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]); + Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]); + Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]); + Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]); + Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]); + 
Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]); + Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]); + Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]); + Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]); + Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]); + Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]); + Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]); + Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] 
*= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = vec_nmsub(Vc3[0], VbS3, Vc7[0]); + 
Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, 
Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, 
Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = 
vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = 
vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + 
Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= 
cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * 
COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] 
= (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]); + Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]); + Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]); + Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]); + Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + 
Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, 
Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + 
Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = 
(vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 
2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = 
vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = 
vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static 
inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); 
+#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + 
solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + From 65de6f5957f9940ed338c1fdef251dbad70eb908 Mon Sep 17 00:00:00 2001 From: Jin Bo Date: Sat, 5 Dec 2020 15:06:12 +0800 Subject: [PATCH 098/121] Fix test errors reported by cblas_cgemm & cblas_ctrmm The file cgemm_kernel_8x4_msa.c holds the MSA optimization codes of cblas_cgemm and cblas_ctrmm. It defines two macros: CGEMM_SCALE_1X2 and CGEMM_TRMM_SCALE_1X2. The pc1 array index in the two macros should be 0 and 1. --- kernel/mips/cgemm_kernel_8x4_msa.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 4b3637c7c..8b624be88 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -758,10 +758,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ \ - pc1[2] += alphar * res2; \ - pc1[2] -= alphai * res3; \ - pc1[3] += alphar * res3; \ - pc1[3] += alphai * res2; \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ } #define CGEMM_SCALE_1X1 \ @@ -1067,10 +1067,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ \ - pc1[2] = alphar * res2; \ - pc1[2] -= alphai * res3; \ - pc1[3] = alphar * res3; \ - pc1[3] += alphai * res2; \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ } #define CGEMM_TRMM_SCALE_1X1 \ From 04fa17322c09c497ad8f69ab12ec8684a0847c60 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:05:27 +0100 Subject: [PATCH 099/121] Fix build options for SolarisStudio compilers --- Makefile.sparc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile.sparc b/Makefile.sparc index 8895b96dd..61c7aa36d 100644 --- a/Makefile.sparc +++ b/Makefile.sparc @@ -3,21 +3,29 @@ RANLIB = ranlib ifdef BINARY64 +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mcpu=v9 -m64 +else +CCOMMON_OPT += -m64 +endif ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mcpu=v9 -m64 endif -ifeq ($(COMPILER_F77), f90) -FCOMMON_OPT += -xarch=v9 +ifeq ($(COMPILER_F77), f95) +FCOMMON_OPT += -m64 endif else +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mcpu=v9 +else +CCOMMON_OPT += -xarch=v9 +endif ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mcpu=v9 endif -ifeq ($(COMPILER_F77), f90) +ifeq ($(COMPILER_F77), f95) FCOMMON_OPT += -xarch=v8plusb endif @@ -37,4 +45,4 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \ else LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \ -Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath -endif \ No newline at end of file +endif From 
da6d5d675c3db0cfd4926704a9b72f89dc4963b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:07:45 +0100 Subject: [PATCH 100/121] Fix hostarch detection for sparc --- c_check | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c_check b/c_check index a841df153..fe9c53f0e 100644 --- a/c_check +++ b/c_check @@ -6,7 +6,8 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); -$hostarch = `uname -p` if ($hostos eq "AIX"); +$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); +chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); From 3a1b1b7c8cc7081155a1f0d9411c9d68ab7559fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:08:43 +0100 Subject: [PATCH 101/121] Fix complex ABI for 32bit SolarisStudio builds --- common_sparc.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common_sparc.h b/common_sparc.h index 85e29fffa..90a24ebf1 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -78,6 +78,12 @@ static __inline unsigned long rpcc(void){ #define __BIG_ENDIAN__ #endif +#ifdef C_SUN +#ifndef __64BIT +#define RETURN_BY_STACK +#endif +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") #else From b0b14f4e9ba13331ab484010b7150495dccb8e83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:12:02 +0100 Subject: [PATCH 102/121] Change comments to C style for compatibility --- param.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/param.h b/param.h index ee5ad17fb..a0d45c573 100644 --- a/param.h +++ b/param.h @@ -1454,22 +1454,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 768 #define SGEMM_DEFAULT_R sgemm_r -//#define SGEMM_DEFAULT_R 1024 +/*#define SGEMM_DEFAULT_R 1024*/ #define DGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_R dgemm_r -//#define DGEMM_DEFAULT_R 1024 +/*#define DGEMM_DEFAULT_R 1024*/ #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P 768 #define CGEMM_DEFAULT_R cgemm_r -//#define CGEMM_DEFAULT_R 1024 +/*#define CGEMM_DEFAULT_R 1024*/ #define ZGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_R zgemm_r -//#define ZGEMM_DEFAULT_R 1024 +/*#define ZGEMM_DEFAULT_R 1024*/ #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r @@ -2571,7 +2571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef LOONGSON3A -////Copy from SICORTEX +/*Copy from SICORTEX*/ #define SNUMOPT 2 #define DNUMOPT 2 @@ -2863,7 +2863,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -// Common ARMv8 parameters +/* Common ARMv8 parameters */ #if defined(ARMV8) #define SNUMOPT 2 @@ -3066,7 +3066,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#else // Other/undetected ARMv8 cores +#else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3095,9 +3095,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#endif // Cores +#endif /* Cores */ -#endif // ARMv8 +#endif /* ARMv8 */ #if defined(ARMV5) #define SNUMOPT 2 From 93473174d6f59b989f36ae0ce6994d347d9c33bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:12:56 +0100 Subject: [PATCH 103/121] Fix utest build with SolarisStudio compilers --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index 1fc30d088..fad3607de 100644 --- a/utest/Makefile +++ 
b/utest/Makefile @@ -35,6 +35,9 @@ endif ifeq ($(C_COMPILER), PGI) OBJS = utest_main2.o endif +ifeq ($(C_COMPILER), SUN) +OBJS = utest_main2.o +endif ifeq ($(OSNAME), AIX) OBJS = utest_main2.o endif From f8346603cf1794826cc2b04cd4708bb890f805b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:14:16 +0100 Subject: [PATCH 104/121] Fix compilation with SolarisStudio --- kernel/arm/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 73ae3acd7..9249b54f8 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else From b660008c7ef479d83f329e1aefbcf3dbed1653a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:15:37 +0100 Subject: [PATCH 105/121] Work around DOT and SWAP test failures --- kernel/sparc/KERNEL.sparc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc index 2e8319ce5..1a2e9671a 100644 --- a/kernel/sparc/KERNEL.sparc +++ b/kernel/sparc/KERNEL.sparc @@ -54,3 +54,13 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + + +SDOTKERNEL = ../generic/dot.c +SDSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../generic/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c From 6c7d557a166aaad44be389acb0ef6bf73935cdc3 Mon Sep 17 00:00:00 2001 From: Martin 
Kroeker Date: Sun, 6 Dec 2020 19:20:50 +0100 Subject: [PATCH 106/121] Fix compiler options for 32 and 64bit SPARC builds with SolarisStudio --- Makefile.system | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index b5974f872..c17cd3bd1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1131,16 +1131,25 @@ CCOMMON_OPT += -w ifeq ($(ARCH), x86) CCOMMON_OPT += -m32 else -FCOMMON_OPT += -m64 +ifdef BINARY64 +CCOMMON_OPT += -m64 +else +CCOMMON_OPT += -m32 +endif endif endif ifeq ($(F_COMPILER), SUN) CCOMMON_OPT += -DF_INTERFACE_SUN +FCOMMON_OPT += -ftrap=%none -xrecursive ifeq ($(ARCH), x86) FCOMMON_OPT += -m32 else +ifdef BINARY64 FCOMMON_OPT += -m64 +else +FCOMMON_OPT += -m32 +endif endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -xopenmp=parallel @@ -1313,8 +1322,10 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) ifneq ($(C_COMPILER), PGI) +ifneq ($(C_COMPILER), SUN) CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME endif +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) From 47b639cc9b4ff900f7b83751af9d1c4ff9dea3c1 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 7 Dec 2020 10:04:00 +0800 Subject: [PATCH 107/121] Fix failed sswap and dswap case by using msa optimization The swap test case will call sswap_msa.c and dswap_msa.c files in MIPS environment. When inc_x or inc_y is equal to zero, the calculation result of the two functions will be wrong. This patch adds the processing of inc_x or inc_y equal to zero, and the swap test case has passed.
--- kernel/mips/dswap_msa.c | 30 ++++++++++++++++++++++++++++-- kernel/mips/sswap_msa.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } From ad38bd0e89c4507476f1ad4ba566d27bb0dd6f9d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 7 Dec 2020 10:18:51 +0800 Subject: [PATCH 108/121] Fix failed cgemv and zgemv test case after using msa optimization The cgemv and zgemv test case will call cgemv_n/t_msa.c zgemv_n/t_msa.c files in MIPS environment. 
When the macro CONJ is defined, the calculation result will be wrong due to the wrong definition of OP2. This patch updates the value of OP2 and passes the corresponding test. --- kernel/mips/cgemv_n_msa.c | 4 ++-- kernel/mips/cgemv_t_msa.c | 26 +++++++++++++++++++------- kernel/mips/zgemv_n_msa.c | 4 ++-- kernel/mips/zgemv_t_msa.c | 26 +++++++++++++++++++------- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c index 12fa7ca02..c1eb9bbfd 100644 --- a/kernel/mips/cgemv_n_msa.c +++ b/kernel/mips/cgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c index 584e3de75..800667b6e 100644 --- a/kernel/mips/cgemv_t_msa.c +++ b/kernel/mips/cgemv_t_msa.c @@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP1 #undef OP2 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define CGEMV_T_8x4() \ diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ From 7834c10e2f6288d0c7fe339375540ebe765f7efc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 7 Dec 2020 16:55:05 +0800 Subject: [PATCH 109/121] Add PingTouGe contribution credit. --- CONTRIBUTORS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7b994885a..be9a32a7c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -190,4 +190,7 @@ In chronological order: * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support * Danfeng Zhang - * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file + * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 + +* PingTouGe Semiconductor Co., Ltd. + * [2020-10] Add RISC-V Vector (0.7.1) support. 
Optimize BLAS kernels for Xuantie C910 From d67babf34536ffd0cba4142aa1ea4496394438cd Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 8 Dec 2020 19:16:39 +0800 Subject: [PATCH 110/121] Remove gcc unrecognized option '-msched-weight' when check msa --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index fe9c53f0e..970d475d7 100644 --- a/c_check +++ b/c_check @@ -199,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } else { $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; From 5d26223f4a91e14ec711168f6e4a40f21729be38 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 20:59:56 +0100 Subject: [PATCH 111/121] remove extra/intermediate size step of min_jj from PR747 --- driver/level3/level3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index a38506585..9b44deb85 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -339,8 +339,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; +/* + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif From a5547124393a3ea7538998e98356cb052dc652d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 21:01:36 +0100 Subject: [PATCH 112/121] remove extra/intermediate size step for min_jj introduced in PR747 --- driver/level3/level3_thread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6e1fd9e99..2b33c9589 100644 --- 
a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else +/* if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif /* Copy part of local region of B into workspace */ From d71fe4ed4eff491a9e6aae87fbd46cf9d2914d9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 21:07:57 +0100 Subject: [PATCH 113/121] Remove GEMM_DEFAULT_UNROLL_MN parameters for Haswell and ZEN (introduced in PR747) --- param.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index a0d45c573..42f63b4b5 100644 --- a/param.h +++ b/param.h @@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 From 4b548857d64e6f0fb3aefbd0bd5bd4d14f2a22d7 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 26 Nov 2020 14:59:41 +0800 Subject: [PATCH 114/121] Add msa support for loongson 1. Using core loongson3r3 and loongson3r4 for loongson 2. 
Add DYNAMIC_ARCH for loongson Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 --- Makefile.system | 27 +- common_linux.h | 8 - common_mips64.h | 9 +- cpuid_mips64.c | 91 +++---- driver/others/Makefile | 8 + driver/others/blas_server.c | 2 + driver/others/dynamic_mips64.c | 230 ++++++++++++++++++ driver/others/parameter.c | 16 +- getarch.c | 24 +- kernel/Makefile | 5 + kernel/Makefile.L3 | 4 - kernel/mips/cgemm_kernel_8x4_msa.c | 4 +- kernel/mips/crot_msa.c | 6 +- kernel/mips/cscal_msa.c | 6 +- kernel/mips/dscal_msa.c | 4 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 38 +-- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 36 +-- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 21 +- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 21 +- kernel/mips/macros_msa.h | 8 +- kernel/mips/srot_msa.c | 6 +- kernel/mips/sscal_msa.c | 6 +- kernel/mips/zscal_msa.c | 8 +- kernel/mips64/KERNEL.LOONGSON3B | 64 ----- .../{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} | 27 +- kernel/mips64/KERNEL.LOONGSON3R4 | 192 +++++++++++++++ kernel/setparam-ref.c | 72 ++++++ param.h | 48 ++-- 28 files changed, 656 insertions(+), 335 deletions(-) create mode 100644 driver/others/dynamic_mips64.c delete mode 100644 kernel/mips64/KERNEL.LOONGSON3B rename kernel/mips64/{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} (75%) create mode 100644 kernel/mips64/KERNEL.LOONGSON3R4 diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..6377f66ea 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 endif +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 +endif + ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += 
-march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1223,10 +1222,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1342,11 +1339,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. 
-#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + 
return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else 
COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..5e0943c2e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. blas_set_parameter(); #endif +#endif } diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + 
+/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. */ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return 
get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/getarch.c b/getarch.c index 9344defb5..e59a4e9b7 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#ifdef FORCE_LOONGSON3R3 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL 
-DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) 
__msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 
0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) 
__msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT 
*c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 
0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT 
*c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) 
src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) 
__msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h 
index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 
+49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o 
-CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o 
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = 
../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = 
../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA 
+STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d0317a745..1e846a61c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,77 @@ static void init_parameter(void) { } #else // (ARCH_ARM64) +#if defined(ARCH_MIPS64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + 
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = 640; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif +} +#else // (ARCH_MIPS64) #if (ARCH_POWER) static void init_parameter(void) { @@ -1780,4 +1851,5 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH +#endif //(ARCH_MIPS64) #endif //(ARCH_ARM64) diff --git a/param.h b/param.h index a0d45c573..6946c2b41 100644 --- a/param.h +++ b/param.h @@ -2570,8 +2570,7 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2579,6 +2578,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2590,6 +2602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#endif #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 44 @@ -2612,7 +2625,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3B +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 @@ -2620,32 +2634,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 24 -#define CGEMM_DEFAULT_P 24 -#define ZGEMM_DEFAULT_P 20 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 92 #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 64 +#define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 -#define ZGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 From be24c66a7c3b746dd9c27db09e4b0e28785025f2 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 10 Dec 2020 10:48:53 +0800 Subject: [PATCH 115/121] Keep LOONGSON3A and LOONGSON3B for loongson --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index e59a4e9b7..29671736e 100644 --- a/getarch.c +++ b/getarch.c @@ -814,7 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef FORCE_LOONGSON3R3 +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "LOONGSON3R3" From 346e30a46a4758eb4d9b8e5783c0b9c3c6b3ce6f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 10 Dec 2020 11:51:42 -0600 Subject: [PATCH 116/121] POWER10: Improve axpy performance This patch aligns the stores to 32 byte boundary for saxpy and daxpy before entering into vector pair loop. Fox caxpy, changed the store instructions to stxv to improve performance of unaligned cases. --- kernel/power/caxpy_microk_power10.c | 24 ++++++++++++++++-------- kernel/power/daxpy_power10.c | 17 ++++++++++++----- kernel/power/saxpy_power10.c | 14 ++++++++++---- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..56a5ab47a 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "#n=%1 
x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); + + i += n1; - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); - - i = n1; while(i < n) { diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) { From 6232237dba7bdd7e185216f7bb0d733ba4c0486e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 11 Dec 2020 23:41:17 +0100 Subject: [PATCH 117/121] Make fallback from P10 to P9 conditional on suitable compiler --- driver/others/dynamic_power.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d60ae68fc..a2f56d839 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -53,8 +53,10 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old 
or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; +#endif return NULL; } From 77edf82c7faf9af1412b0f0c9de7a7543341b2e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 01:25:20 +0100 Subject: [PATCH 118/121] Update Changelog.txt for 0.3.13 --- Changelog.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index edd3563ec..807c5ff20 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,54 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.13 + 12-Dec-2020 + + common: + * Added a generic bfloat16 SBGEMV kernel + * Fixed a potentially severe memory leak after fork in OpenMP builds + that was introduces in 0.3.12 + * Added detection of the Fujitsu Fortran compiler + * Added detection of the (e)gfortran compiler on OpenBSD + * Added support for overriding the default name of the library independently + from symbol suffixing in the gmake builds (already supported in cmake) + +RISCV: + * Added a RISC V port optimized for C910V + +POWER: + * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N + * Improved DGEMM performance on POWER10 + * Improved STRSM and DTRSM performance on POWER9 and POWER10 + * Fixed segmemtation faults in DYNAMIC_ARCH builds + * Fixed compilation with the PGI compiler + +x86: + * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12 + +x86_64: + * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake + * Improved the performance of SASUM and DASUM kernels through parallelization + * Improved the performance of SROT and DROT kernels + * Improved the performance of multithreaded xSYRK + * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran + (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or + wrong 
results) + * Fixed miscompilations by old gcc 4.6 + * Fixed misdetection of AVX2 capability in some Sandybridge cpus + * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD + +ARM64: + * Fixed segmemtation faults in DYNAMIC_ARCH builds + +MIPS: + * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA + * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV + * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP + * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only) + +SPARC: + * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers + ==================================================================== Version 0.3.12 24-Oct-2020 From 3dec81200cdac01651681a3e36f77179a0815eb4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 14:27:37 +0100 Subject: [PATCH 119/121] Update Changelog.txt Co-authored-by: h-vetinari --- Changelog.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 807c5ff20..cbc7007ac 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -6,7 +6,7 @@ Version 0.3.13 common: * Added a generic bfloat16 SBGEMV kernel * Fixed a potentially severe memory leak after fork in OpenMP builds - that was introduces in 0.3.12 + that was introduced in 0.3.12 * Added detection of the Fujitsu Fortran compiler * Added detection of the (e)gfortran compiler on OpenBSD * Added support for overriding the default name of the library independently From d3ec787f774bc678ec13f0ed87fe2f3d67af1a11 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:14:49 +0100 Subject: [PATCH 120/121] Update version to 0.3.13 for release --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1a0965d08..e4b82104e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12.dev 
+VERSION = 0.3.13 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 7bc0e4a2e001117d7e51f0ef8ea1abc4b734d079 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:15:33 +0100 Subject: [PATCH 121/121] Update version to 0.3.13 for release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb4399e4..12730e0e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12.dev) +set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions