From ec14e1648cff986c3f4b5852ea94b8a1bec1b2ee Mon Sep 17 00:00:00 2001 From: Srangrang Date: Sun, 15 Jun 2025 20:25:15 +0800 Subject: [PATCH] fix: resolve build failure on non-RISCV hosts - adjust interface to disable the "small matrix" pathway for HFLOAT16 - separate HFLOAT16 from BFLOAT16 - drop the SHGEMM_UNROLL_M/SHGEMM_UNROLL_N inequality guard around the SHGEMM inner-copy rules - enable BUILD_HFLOAT16 by default in Makefile.rule Related to PR#5290 Co-authored-by: Martin --- Makefile.rule | 2 +- Makefile.system | 2 ++ exports/Makefile | 14 +++++++------- exports/gensymbol | 25 ++++++++++++++++--------- exports/gensymbol.pl | 22 +++++++++++++--------- interface/gemm.c | 14 ++++++++------ kernel/Makefile.L3 | 16 ++++++++-------- kernel/riscv64/KERNEL.RISCV64_ZVL128B | 2 ++ kernel/riscv64/KERNEL.RISCV64_ZVL256B | 2 ++ kernel/x86_64/KERNEL.HASWELL | 9 +++++++++ 10 files changed, 68 insertions(+), 40 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 24b34d1c2..00c7c07cc 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -309,7 +309,7 @@ COMMON_PROF = -pg # BUILD_BFLOAT16 = 1 # If you want to enable the experimental HFLOAT16 support -# BUILD_HFLOAT16 = 1 +BUILD_HFLOAT16 = 1 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS # will be allocated on the heap rather than the stack. 
(This array alone requires diff --git a/Makefile.system b/Makefile.system index be31d05ef..820b3aff7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1898,6 +1898,8 @@ export NO_LASX export SBGEMM_UNROLL_M export SBGEMM_UNROLL_N +export SHGEMM_UNROLL_M +export SHGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/exports/Makefile b/exports/Makefile index b4b391a19..176b1a766 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -133,10 +133,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : $(GENSYM) - ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) libgoto_hpl.def : $(GENSYM) - ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) ifeq ($(OSNAME), Darwin) ifeq ($(FIXED_LIBNAME),1) @@ -301,23 +301,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : $(GENSYM) ../Makefile.system ../getarch.c - 
./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) aix.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) objconv.def : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) 
$(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) + ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : $(GENSYM) ../Makefile.system ../getarch.c - ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c + ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 231e72f48..3719574ea 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -3816,13 +3816,20 @@ shift p16=$9 shift p17=$9 +shift +p18=$9 if [ $p13 -eq 1 ]; then - blasobjs="$blasobjs $bfblasobjs $hfblasobjs" - cblasobjs="$cblasobjs $bfcblasobjs $hfcblasobjs" + blasobjs="$blasobjs $bfblasobjs" + cblasobjs="$cblasobjs $bfcblasobjs" fi if [ $p14 -eq 1 ]; then + blasobjs="$blasobjs $hfblasobjs" + cblasobjs="$cblasobjs $hfcblasobjs" +fi + +if [ $p15 -eq 1 ]; then blasobjs="$blasobjs $blasobjss" cblasobjs="$cblasobjs $cblasobjss" lapackobjs="$lapackobjs $lapackobjss" @@ -3835,11 +3842,11 @@ if [ $p14 -eq 1 ]; then 
lapackeobjs="$lapackeobjs $lapackeobjss" fi -if [ $p15 -eq 1 ]; then +if [ $p16 -eq 1 ]; then blasobjs="$blasobjs $blasobjsd" cblasobjs="$cblasobjs $cblasobjsd" lapackobjs="$lapackobjs $lapackobjsd" - if [ $p14 -eq 0 ]; then + if [ $p15 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2ds" fi lapackobjs2="$lapackobjs2 $lapackobjs2d $lapackobjs2dz" @@ -3849,14 +3856,14 @@ if [ $p15 -eq 1 ]; then lapackeobjs="$lapackeobjs $lapackeobjsd" fi -if [ $p16 -eq 1 ]; then +if [ $p17 -eq 1 ]; then blasobjs="$blasobjs $blasobjsc" cblasobjs="$cblasobjs $cblasobjsc" gemm3mobjs="$gemm3mobjs $gemm3mobjsc" cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsc" lapackobjs="$lapackobjs $lapackobjsc" lapackobjs2="$lapackobjs2 $lapackobjs2c $lapackobjs2zc" - if [ $p14 -eq 0 ]; then + if [ $p15 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2sc" fi lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsc" @@ -3865,17 +3872,17 @@ if [ $p16 -eq 1 ]; then lapackeobjs="$lapackeobjs $lapackeobjsc" fi -if [ $p17 -eq 1 ]; then +if [ $p18 -eq 1 ]; then blasobjs="$blasobjs $blasobjsz" cblasobjs="$cblasobjs $cblasobjsz" gemm3mobjs="$gemm3mobjs $gemm3mobjsz" cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsz" lapackobjs="$lapackobjs $lapackobjsz" lapackobjs2="$lapackobjs2 $lapackobjs2z" - if [ $p16 -eq 0 ]; then + if [ $p17 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2zc" fi - if [ $p15 -eq 0 ]; then + if [ $p16 -eq 0 ]; then lapackobjs2="$lapackobjs2 $lapackobjs2dz" fi lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsz" diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index 1c4e912f2..5a8423697 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -3774,10 +3774,14 @@ use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); if ($ARGV[12] == 1) { - @blasobjs = (@blasobjs, @bfblasobjs, @hfblasobjs); - @cblasobjs = (@cblasobjs, @bfcblasobjs, @hfcblasobjs); + @blasobjs = 
(@blasobjs, @bfblasobjs); + @cblasobjs = (@cblasobjs, @bfcblasobjs); } if ($ARGV[13] == 1) { + @blasobjs = (@blasobjs, @hfblasobjs); + @cblasobjs = (@cblasobjs, @hfcblasobjs); +} +if ($ARGV[14] == 1) { @blasobjs = (@blasobjs, @blasobjss); @cblasobjs = (@cblasobjs, @cblasobjss); @lapackobjs = (@lapackobjs, @lapackobjss); @@ -3789,11 +3793,11 @@ if ($ARGV[13] == 1) { @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); @lapackeobjs = (@lapackeobjs, @lapackeobjss); } -if ($ARGV[14] == 1) { +if ($ARGV[15] == 1) { @blasobjs = (@blasobjs, @blasobjsd); @cblasobjs = (@cblasobjs, @cblasobjsd); @lapackobjs = (@lapackobjs, @lapackobjsd); - if ($ARGV[13] == 0) { + if ($ARGV[14] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); } @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); @@ -3802,14 +3806,14 @@ if ($ARGV[14] == 1) { @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); @lapackeobjs = (@lapackeobjs, @lapackeobjsd); } -if ($ARGV[15] == 1) { +if ($ARGV[16] == 1) { @blasobjs = (@blasobjs, @blasobjsc); @cblasobjs = (@cblasobjs, @cblasobjsc); @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); @lapackobjs = (@lapackobjs, @lapackobjsc); @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); - if ($ARGV[13] == 0) { + if ($ARGV[14] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); } @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); @@ -3817,17 +3821,17 @@ if ($ARGV[15] == 1) { @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); @lapackeobjs = (@lapackeobjs, @lapackeobjsc); } -if ($ARGV[16] == 1) { +if ($ARGV[17] == 1) { @blasobjs = (@blasobjs, @blasobjsz); @cblasobjs = (@cblasobjs, @cblasobjsz); @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); @lapackobjs = (@lapackobjs, 
@lapackobjsz); @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); - if ($ARGV[15] == 0) { + if ($ARGV[16] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); } - if ($ARGV[14] == 0) { + if ($ARGV[15] == 0) { @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); } @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); diff --git a/interface/gemm.c b/interface/gemm.c index 54e5604fd..d79282e13 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -56,6 +56,8 @@ #elif defined(BFLOAT16) #define ERROR_NAME "SBGEMM " #define GEMV BLASFUNC(sbgemv) +#elif defined(HFLOAT16) +#define ERROR_NAME "SHGEMM " #else #define ERROR_NAME "SGEMM " #define GEMV BLASFUNC(sgemv) @@ -111,7 +113,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) &&!defined(HFLOAT16) #define USE_SMALL_MATRIX_OPT 1 #else #define USE_SMALL_MATRIX_OPT 0 @@ -219,11 +221,11 @@ static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { static inline int get_gemm_optimal_nthreads(double MNK) { int ncpu = num_cpu_avail(3); -#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); -#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); -#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return 
get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); } @@ -417,7 +419,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) #if defined(DYNAMIC_ARCH) if (support_avx512() ) @@ -577,7 +579,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif -#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) +#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(HFLOAT16) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) #if defined(ARCH_ARM64) // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c} // perform poorly in certain circumstances. 
We use the following boolean diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 71d66f8f3..6afb49a77 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -133,14 +133,14 @@ ifeq ($(BUILD_HFLOAT16), 1) ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c SHGEMMKERNEL = ../generic/gemmkernel_2x2.c -SHGEMMINCOPY = ../generic/gemm_ncopy_2.c -SHGEMMITCOPY = ../generic/gemm_tcopy_2.c SHGEMMONCOPY = ../generic/gemm_ncopy_2.c SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +SHGEMMINCOPY = ../generic/gemm_ncopy_2.c +SHGEMMITCOPY = ../generic/gemm_tcopy_2.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) endif SHKERNELOBJS += \ @@ -726,7 +726,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +#ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ @@ -734,7 +734,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif +#endif endif $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) @@ -2957,14 +2957,14 @@ $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +#ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(SHGEMMITCOPYOBJ_P) : 
$(KERNELDIR)/$(SHGEMMITCOPY) $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -endif +#endif endif $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index d2a2d3578..fac233c00 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -246,6 +246,7 @@ ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta_rvv.c endif +ifeq ($(BUILD_HFLOAT16), 1) SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c @@ -253,4 +254,5 @@ SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) ifndef SHGEMM_BETA SHGEMM_BETA = gemm_beta_rvv.c +endif endif \ No newline at end of file diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 847ebff70..a41678abe 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -210,6 +210,7 @@ DOMATCOPY_CN = omatcopy_cn_vector.c SOMATCOPY_CN = omatcopy_cn_vector.c +ifeq ($(BUILD_HFLOAT16), 1) SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c @@ -224,6 +225,7 @@ SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) ifndef SHGEMM_BETA SHGEMM_BETA = gemm_beta_rvv.c endif +endif SAXPBYKERNEL = axpby_vector_v2.c DAXPBYKERNEL = axpby_vector_v2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index aaf686c9f..91962c150 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -106,3 +106,12 @@ DASUMKERNEL = dasum.c SROTKERNEL = srot.c DROTKERNEL = drot.c + + +ifeq ($(BUILD_HFLOAT16), 1) +SHGEMMKERNEL = ../generic/gemmkernel_2x2.c +SHGEMMONCOPY = ../generic/gemm_ncopy_2.c +SHGEMMOTCOPY = 
../generic/gemm_tcopy_2.c +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif \ No newline at end of file