- adjust interface to disable the "small matrix" pathway
- separate HFLOAT16 from BFLOAT16
- remove the conditionals that compare SHGEMM_UNROLL_M with SHGEMM_UNROLL_N

Related to PR #5290
Co-authored-by: Martin
Ref: pull/5290/head
| @@ -309,7 +309,7 @@ COMMON_PROF = -pg | |||
| # BUILD_BFLOAT16 = 1 | |||
| # If you want to enable the experimental HFLOAT16 support | |||
| # BUILD_HFLOAT16 = 1 | |||
| BUILD_HFLOAT16 = 1 | |||
| # Set the thread number threshold beyond which the job array for the threaded level3 BLAS | |||
| # will be allocated on the heap rather than the stack. (This array alone requires | |||
| @@ -1898,6 +1898,8 @@ export NO_LASX | |||
| export SBGEMM_UNROLL_M | |||
| export SBGEMM_UNROLL_N | |||
| export SHGEMM_UNROLL_M | |||
| export SHGEMM_UNROLL_N | |||
| export SGEMM_UNROLL_M | |||
| export SGEMM_UNROLL_N | |||
| export DGEMM_UNROLL_M | |||
| @@ -133,10 +133,10 @@ dll : ../$(LIBDLLNAME) | |||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | |||
| $(LIBPREFIX).def : $(GENSYM) | |||
| ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| libgoto_hpl.def : $(GENSYM) | |||
| ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ifeq ($(OSNAME), Darwin) | |||
| ifeq ($(FIXED_LIBNAME),1) | |||
| @@ -301,23 +301,23 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| aix.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| objconv.def : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : $(GENSYM) ../Makefile.system ../getarch.c | |||
| ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -3816,13 +3816,20 @@ shift | |||
| p16=$9 | |||
| shift | |||
| p17=$9 | |||
| shift | |||
| p18=$9 | |||
| if [ $p13 -eq 1 ]; then | |||
| blasobjs="$blasobjs $bfblasobjs $hfblasobjs" | |||
| cblasobjs="$cblasobjs $bfcblasobjs $hfcblasobjs" | |||
| blasobjs="$blasobjs $bfblasobjs" | |||
| cblasobjs="$cblasobjs $bfcblasobjs" | |||
| fi | |||
| if [ $p14 -eq 1 ]; then | |||
| blasobjs="$blasobjs $hfblasobjs" | |||
| cblasobjs="$cblasobjs $hfcblasobjs" | |||
| fi | |||
| if [ $p15 -eq 1 ]; then | |||
| blasobjs="$blasobjs $blasobjss" | |||
| cblasobjs="$cblasobjs $cblasobjss" | |||
| lapackobjs="$lapackobjs $lapackobjss" | |||
| @@ -3835,11 +3842,11 @@ if [ $p14 -eq 1 ]; then | |||
| lapackeobjs="$lapackeobjs $lapackeobjss" | |||
| fi | |||
| if [ $p15 -eq 1 ]; then | |||
| if [ $p16 -eq 1 ]; then | |||
| blasobjs="$blasobjs $blasobjsd" | |||
| cblasobjs="$cblasobjs $cblasobjsd" | |||
| lapackobjs="$lapackobjs $lapackobjsd" | |||
| if [ $p14 -eq 0 ]; then | |||
| if [ $p15 -eq 0 ]; then | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2ds" | |||
| fi | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2d $lapackobjs2dz" | |||
| @@ -3849,14 +3856,14 @@ if [ $p15 -eq 1 ]; then | |||
| lapackeobjs="$lapackeobjs $lapackeobjsd" | |||
| fi | |||
| if [ $p16 -eq 1 ]; then | |||
| if [ $p17 -eq 1 ]; then | |||
| blasobjs="$blasobjs $blasobjsc" | |||
| cblasobjs="$cblasobjs $cblasobjsc" | |||
| gemm3mobjs="$gemm3mobjs $gemm3mobjsc" | |||
| cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsc" | |||
| lapackobjs="$lapackobjs $lapackobjsc" | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2c $lapackobjs2zc" | |||
| if [ $p14 -eq 0 ]; then | |||
| if [ $p15 -eq 0 ]; then | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2sc" | |||
| fi | |||
| lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsc" | |||
| @@ -3865,17 +3872,17 @@ if [ $p16 -eq 1 ]; then | |||
| lapackeobjs="$lapackeobjs $lapackeobjsc" | |||
| fi | |||
| if [ $p17 -eq 1 ]; then | |||
| if [ $p18 -eq 1 ]; then | |||
| blasobjs="$blasobjs $blasobjsz" | |||
| cblasobjs="$cblasobjs $cblasobjsz" | |||
| gemm3mobjs="$gemm3mobjs $gemm3mobjsz" | |||
| cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsz" | |||
| lapackobjs="$lapackobjs $lapackobjsz" | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2z" | |||
| if [ $p16 -eq 0 ]; then | |||
| if [ $p17 -eq 0 ]; then | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2zc" | |||
| fi | |||
| if [ $p15 -eq 0 ]; then | |||
| if [ $p16 -eq 0 ]; then | |||
| lapackobjs2="$lapackobjs2 $lapackobjs2dz" | |||
| fi | |||
| lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsz" | |||
| @@ -3774,10 +3774,14 @@ use File::Basename; | |||
| my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | |||
| if ($ARGV[12] == 1) { | |||
| @blasobjs = (@blasobjs, @bfblasobjs, @hfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @bfcblasobjs, @hfcblasobjs); | |||
| @blasobjs = (@blasobjs, @bfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @bfcblasobjs); | |||
| } | |||
| if ($ARGV[13] == 1) { | |||
| @blasobjs = (@blasobjs, @hfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @hfcblasobjs); | |||
| } | |||
| if ($ARGV[14] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjss); | |||
| @cblasobjs = (@cblasobjs, @cblasobjss); | |||
| @lapackobjs = (@lapackobjs, @lapackobjss); | |||
| @@ -3789,11 +3793,11 @@ if ($ARGV[13] == 1) { | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjss); | |||
| } | |||
| if ($ARGV[14] == 1) { | |||
| if ($ARGV[15] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjsd); | |||
| @cblasobjs = (@cblasobjs, @cblasobjsd); | |||
| @lapackobjs = (@lapackobjs, @lapackobjsd); | |||
| if ($ARGV[13] == 0) { | |||
| if ($ARGV[14] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); | |||
| } | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); | |||
| @@ -3802,14 +3806,14 @@ if ($ARGV[14] == 1) { | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsd); | |||
| } | |||
| if ($ARGV[15] == 1) { | |||
| if ($ARGV[16] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjsc); | |||
| @cblasobjs = (@cblasobjs, @cblasobjsc); | |||
| @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); | |||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); | |||
| @lapackobjs = (@lapackobjs, @lapackobjsc); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); | |||
| if ($ARGV[13] == 0) { | |||
| if ($ARGV[14] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); | |||
| } | |||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); | |||
| @@ -3817,17 +3821,17 @@ if ($ARGV[15] == 1) { | |||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); | |||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsc); | |||
| } | |||
| if ($ARGV[16] == 1) { | |||
| if ($ARGV[17] == 1) { | |||
| @blasobjs = (@blasobjs, @blasobjsz); | |||
| @cblasobjs = (@cblasobjs, @cblasobjsz); | |||
| @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); | |||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); | |||
| @lapackobjs = (@lapackobjs, @lapackobjsz); | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); | |||
| if ($ARGV[15] == 0) { | |||
| if ($ARGV[16] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); | |||
| } | |||
| if ($ARGV[14] == 0) { | |||
| if ($ARGV[15] == 0) { | |||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); | |||
| } | |||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); | |||
| @@ -56,6 +56,8 @@ | |||
| #elif defined(BFLOAT16) | |||
| #define ERROR_NAME "SBGEMM " | |||
| #define GEMV BLASFUNC(sbgemv) | |||
| #elif defined(HFLOAT16) | |||
| #define ERROR_NAME "SHGEMM " | |||
| #else | |||
| #define ERROR_NAME "SGEMM " | |||
| #define GEMV BLASFUNC(sgemv) | |||
| @@ -111,7 +113,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B | |||
| #endif | |||
| }; | |||
| #if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) | |||
| #if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) &&!defined(HFLOAT16) | |||
| #define USE_SMALL_MATRIX_OPT 1 | |||
| #else | |||
| #define USE_SMALL_MATRIX_OPT 0 | |||
| @@ -219,11 +221,11 @@ static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { | |||
| static inline int get_gemm_optimal_nthreads(double MNK) { | |||
| int ncpu = num_cpu_avail(3); | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||
| return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||
| } | |||
| @@ -417,7 +419,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| PRINT_DEBUG_CNAME; | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
| #if defined(DYNAMIC_ARCH) | |||
| if (support_avx512() ) | |||
| @@ -577,7 +579,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | |||
| #endif | |||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) | |||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(HFLOAT16) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) | |||
| #if defined(ARCH_ARM64) | |||
| // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c} | |||
| // perform poorly in certain circumstances. We use the following boolean | |||
| @@ -133,14 +133,14 @@ ifeq ($(BUILD_HFLOAT16), 1) | |||
| ifndef SHGEMMKERNEL | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SHKERNELOBJS += \ | |||
| @@ -726,7 +726,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| #ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -734,7 +734,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| #endif | |||
| endif | |||
| $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| @@ -2957,14 +2957,14 @@ $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| #ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| #endif | |||
| endif | |||
| $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| @@ -246,6 +246,7 @@ ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c | |||
| @@ -253,4 +254,5 @@ SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifndef SHGEMM_BETA | |||
| SHGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| endif | |||
| @@ -210,6 +210,7 @@ DOMATCOPY_CN = omatcopy_cn_vector.c | |||
| SOMATCOPY_CN = omatcopy_cn_vector.c | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c | |||
| @@ -224,6 +225,7 @@ SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifndef SHGEMM_BETA | |||
| SHGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| endif | |||
| SAXPBYKERNEL = axpby_vector_v2.c | |||
| DAXPBYKERNEL = axpby_vector_v2.c | |||
| @@ -106,3 +106,12 @@ DASUMKERNEL = dasum.c | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||