Browse Source

fix: resolve build failure on non-RISCV hosts

- adjust the GEMM interface to disable the "small matrix" pathway for HFLOAT16
- separate HFLOAT16 from BFLOAT16
- remove the checks on whether SHGEMM_UNROLL_M and SHGEMM_UNROLL_N are equal

Related to PR#5290
Co-authored-by: Martin
pull/5290/head
Srangrang 5 months ago
parent
commit
ec14e1648c
10 changed files with 68 additions and 40 deletions
  1. +1
    -1
      Makefile.rule
  2. +2
    -0
      Makefile.system
  3. +7
    -7
      exports/Makefile
  4. +16
    -9
      exports/gensymbol
  5. +13
    -9
      exports/gensymbol.pl
  6. +8
    -6
      interface/gemm.c
  7. +8
    -8
      kernel/Makefile.L3
  8. +2
    -0
      kernel/riscv64/KERNEL.RISCV64_ZVL128B
  9. +2
    -0
      kernel/riscv64/KERNEL.RISCV64_ZVL256B
  10. +9
    -0
      kernel/x86_64/KERNEL.HASWELL

+ 1
- 1
Makefile.rule View File

@@ -309,7 +309,7 @@ COMMON_PROF = -pg
# BUILD_BFLOAT16 = 1

# If you want to enable the experimental HFLOAT16 support
# BUILD_HFLOAT16 = 1
BUILD_HFLOAT16 = 1

# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
# will be allocated on the heap rather than the stack. (This array alone requires


+ 2
- 0
Makefile.system View File

@@ -1898,6 +1898,8 @@ export NO_LASX

export SBGEMM_UNROLL_M
export SBGEMM_UNROLL_N
export SHGEMM_UNROLL_M
export SHGEMM_UNROLL_N
export SGEMM_UNROLL_M
export SGEMM_UNROLL_N
export DGEMM_UNROLL_M


+ 7
- 7
exports/Makefile View File

@@ -133,10 +133,10 @@ dll : ../$(LIBDLLNAME)
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)

$(LIBPREFIX).def : $(GENSYM)
./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)

libgoto_hpl.def : $(GENSYM)
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)

ifeq ($(OSNAME), Darwin)
ifeq ($(FIXED_LIBNAME),1)
@@ -301,23 +301,23 @@ static : ../$(LIBNAME)
rm -f goto.$(SUFFIX)

osx.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)

aix.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)

objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)

objconv.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)

test : linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest

linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c

clean ::
@rm -f *.def *.dylib __.SYMDEF* *.renamed


+ 16
- 9
exports/gensymbol View File

@@ -3816,13 +3816,20 @@ shift
p16=$9
shift
p17=$9
shift
p18=$9

if [ $p13 -eq 1 ]; then
blasobjs="$blasobjs $bfblasobjs $hfblasobjs"
cblasobjs="$cblasobjs $bfcblasobjs $hfcblasobjs"
blasobjs="$blasobjs $bfblasobjs"
cblasobjs="$cblasobjs $bfcblasobjs"
fi

if [ $p14 -eq 1 ]; then
blasobjs="$blasobjs $hfblasobjs"
cblasobjs="$cblasobjs $hfcblasobjs"
fi

if [ $p15 -eq 1 ]; then
blasobjs="$blasobjs $blasobjss"
cblasobjs="$cblasobjs $cblasobjss"
lapackobjs="$lapackobjs $lapackobjss"
@@ -3835,11 +3842,11 @@ if [ $p14 -eq 1 ]; then
lapackeobjs="$lapackeobjs $lapackeobjss"
fi

if [ $p15 -eq 1 ]; then
if [ $p16 -eq 1 ]; then
blasobjs="$blasobjs $blasobjsd"
cblasobjs="$cblasobjs $cblasobjsd"
lapackobjs="$lapackobjs $lapackobjsd"
if [ $p14 -eq 0 ]; then
if [ $p15 -eq 0 ]; then
lapackobjs2="$lapackobjs2 $lapackobjs2ds"
fi
lapackobjs2="$lapackobjs2 $lapackobjs2d $lapackobjs2dz"
@@ -3849,14 +3856,14 @@ if [ $p15 -eq 1 ]; then
lapackeobjs="$lapackeobjs $lapackeobjsd"
fi

if [ $p16 -eq 1 ]; then
if [ $p17 -eq 1 ]; then
blasobjs="$blasobjs $blasobjsc"
cblasobjs="$cblasobjs $cblasobjsc"
gemm3mobjs="$gemm3mobjs $gemm3mobjsc"
cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsc"
lapackobjs="$lapackobjs $lapackobjsc"
lapackobjs2="$lapackobjs2 $lapackobjs2c $lapackobjs2zc"
if [ $p14 -eq 0 ]; then
if [ $p15 -eq 0 ]; then
lapackobjs2="$lapackobjs2 $lapackobjs2sc"
fi
lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsc"
@@ -3865,17 +3872,17 @@ if [ $p16 -eq 1 ]; then
lapackeobjs="$lapackeobjs $lapackeobjsc"
fi

if [ $p17 -eq 1 ]; then
if [ $p18 -eq 1 ]; then
blasobjs="$blasobjs $blasobjsz"
cblasobjs="$cblasobjs $cblasobjsz"
gemm3mobjs="$gemm3mobjs $gemm3mobjsz"
cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsz"
lapackobjs="$lapackobjs $lapackobjsz"
lapackobjs2="$lapackobjs2 $lapackobjs2z"
if [ $p16 -eq 0 ]; then
if [ $p17 -eq 0 ]; then
lapackobjs2="$lapackobjs2 $lapackobjs2zc"
fi
if [ $p15 -eq 0 ]; then
if [ $p16 -eq 0 ]; then
lapackobjs2="$lapackobjs2 $lapackobjs2dz"
fi
lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsz"


+ 13
- 9
exports/gensymbol.pl View File

@@ -3774,10 +3774,14 @@ use File::Basename;
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");

if ($ARGV[12] == 1) {
@blasobjs = (@blasobjs, @bfblasobjs, @hfblasobjs);
@cblasobjs = (@cblasobjs, @bfcblasobjs, @hfcblasobjs);
@blasobjs = (@blasobjs, @bfblasobjs);
@cblasobjs = (@cblasobjs, @bfcblasobjs);
}
if ($ARGV[13] == 1) {
@blasobjs = (@blasobjs, @hfblasobjs);
@cblasobjs = (@cblasobjs, @hfcblasobjs);
}
if ($ARGV[14] == 1) {
@blasobjs = (@blasobjs, @blasobjss);
@cblasobjs = (@cblasobjs, @cblasobjss);
@lapackobjs = (@lapackobjs, @lapackobjss);
@@ -3789,11 +3793,11 @@ if ($ARGV[13] == 1) {
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s);
@lapackeobjs = (@lapackeobjs, @lapackeobjss);
}
if ($ARGV[14] == 1) {
if ($ARGV[15] == 1) {
@blasobjs = (@blasobjs, @blasobjsd);
@cblasobjs = (@cblasobjs, @cblasobjsd);
@lapackobjs = (@lapackobjs, @lapackobjsd);
if ($ARGV[13] == 0) {
if ($ARGV[14] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
}
@lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz);
@@ -3802,14 +3806,14 @@ if ($ARGV[14] == 1) {
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d);
@lapackeobjs = (@lapackeobjs, @lapackeobjsd);
}
if ($ARGV[15] == 1) {
if ($ARGV[16] == 1) {
@blasobjs = (@blasobjs, @blasobjsc);
@cblasobjs = (@cblasobjs, @cblasobjsc);
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc);
@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc);
@lapackobjs = (@lapackobjs, @lapackobjsc);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc);
if ($ARGV[13] == 0) {
if ($ARGV[14] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
}
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc);
@@ -3817,17 +3821,17 @@ if ($ARGV[15] == 1) {
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c);
@lapackeobjs = (@lapackeobjs, @lapackeobjsc);
}
if ($ARGV[16] == 1) {
if ($ARGV[17] == 1) {
@blasobjs = (@blasobjs, @blasobjsz);
@cblasobjs = (@cblasobjs, @cblasobjsz);
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz);
@cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz);
@lapackobjs = (@lapackobjs, @lapackobjsz);
@lapackobjs2 = (@lapackobjs2, @lapackobjs2z);
if ($ARGV[15] == 0) {
if ($ARGV[16] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2zc);
}
if ($ARGV[14] == 0) {
if ($ARGV[15] == 0) {
@lapackobjs2 = (@lapackobjs2, @lapackobjs2dz);
}
@lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz);


+ 8
- 6
interface/gemm.c View File

@@ -56,6 +56,8 @@
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMM "
#define GEMV BLASFUNC(sbgemv)
#elif defined(HFLOAT16)
#define ERROR_NAME "SHGEMM "
#else
#define ERROR_NAME "SGEMM "
#define GEMV BLASFUNC(sgemv)
@@ -111,7 +113,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
#endif
};

#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE)
#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) &&!defined(HFLOAT16)
#define USE_SMALL_MATRIX_OPT 1
#else
#define USE_SMALL_MATRIX_OPT 0
@@ -219,11 +221,11 @@ static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) {

static inline int get_gemm_optimal_nthreads(double MNK) {
int ncpu = num_cpu_avail(3);
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu);
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu);
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
if (strcmp(gotoblas_corename(), "neoversev1") == 0) {
return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu);
}
@@ -417,7 +419,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
@@ -577,7 +579,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
#endif

#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16))
#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(HFLOAT16) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16))
#if defined(ARCH_ARM64)
// The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c}
// perform poorly in certain circumstances. We use the following boolean


+ 8
- 8
kernel/Makefile.L3 View File

@@ -133,14 +133,14 @@ ifeq ($(BUILD_HFLOAT16), 1)
ifndef SHGEMMKERNEL
SHGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif

SHKERNELOBJS += \
@@ -726,7 +726,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
#ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))

$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
@@ -734,7 +734,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

endif
#endif
endif

$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
@@ -2957,14 +2957,14 @@ $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
#ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

endif
#endif
endif

$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)


+ 2
- 0
kernel/riscv64/KERNEL.RISCV64_ZVL128B View File

@@ -246,6 +246,7 @@ ifndef ZGEMM_BETA
ZGEMM_BETA = zgemm_beta_rvv.c
endif

ifeq ($(BUILD_BFLOAT16), 1)
SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
@@ -253,4 +254,5 @@ SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifndef SHGEMM_BETA
SHGEMM_BETA = gemm_beta_rvv.c
endif
endif

+ 2
- 0
kernel/riscv64/KERNEL.RISCV64_ZVL256B View File

@@ -210,6 +210,7 @@ DOMATCOPY_CN = omatcopy_cn_vector.c
SOMATCOPY_CN = omatcopy_cn_vector.c


ifeq ($(BUILD_BFLOAT16), 1)
SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c
@@ -224,6 +225,7 @@ SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifndef SHGEMM_BETA
SHGEMM_BETA = gemm_beta_rvv.c
endif
endif

SAXPBYKERNEL = axpby_vector_v2.c
DAXPBYKERNEL = axpby_vector_v2.c

+ 9
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -106,3 +106,12 @@ DASUMKERNEL = dasum.c

SROTKERNEL = srot.c
DROTKERNEL = drot.c


# SHGEMM kernel selection for this target: use the generic 2x2 GEMM kernel
# together with the generic ncopy/tcopy (width 2) packing routines, and name
# the resulting "outer" copy objects.
# NOTE(review): this block is guarded by BUILD_BFLOAT16, whereas the SHGEMM
# defaults in kernel/Makefile.L3 are guarded by BUILD_HFLOAT16 and the SHGEMM
# objects are compiled with -DHFLOAT16 — confirm which flag is intended here.
# NOTE(review): only the ONCOPY/OTCOPY variants are set; SHGEMMINCOPY and
# SHGEMMITCOPY are not defined in this block — verify that is sufficient once
# SHGEMMKERNEL is defined and the kernel/Makefile.L3 defaults are skipped.
ifeq ($(BUILD_BFLOAT16), 1)
# Generic C micro-kernel (2x2 unroll).
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
# Generic packing (copy) routine sources.
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
# Object file names for the packing routines above.
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif

Loading…
Cancel
Save