- adjust interface to disable "small matrix" pathway
- separate HFLOAT16 from BFLOAT16
- remove SHGEMM_UNROLL_M and SHGEMM_UNROLL_N equal conditions

Related to PR#5290
Co-authored-by: Martin
pull/5290/head
| @@ -309,7 +309,7 @@ COMMON_PROF = -pg | |||||
| # BUILD_BFLOAT16 = 1 | # BUILD_BFLOAT16 = 1 | ||||
| # If you want to enable the experimental HFLOAT16 support | # If you want to enable the experimental HFLOAT16 support | ||||
| # BUILD_HFLOAT16 = 1 | |||||
| BUILD_HFLOAT16 = 1 | |||||
| # Set the thread number threshold beyond which the job array for the threaded level3 BLAS | # Set the thread number threshold beyond which the job array for the threaded level3 BLAS | ||||
| # will be allocated on the heap rather than the stack. (This array alone requires | # will be allocated on the heap rather than the stack. (This array alone requires | ||||
| @@ -1898,6 +1898,8 @@ export NO_LASX | |||||
| export SBGEMM_UNROLL_M | export SBGEMM_UNROLL_M | ||||
| export SBGEMM_UNROLL_N | export SBGEMM_UNROLL_N | ||||
| export SHGEMM_UNROLL_M | |||||
| export SHGEMM_UNROLL_N | |||||
| export SGEMM_UNROLL_M | export SGEMM_UNROLL_M | ||||
| export SGEMM_UNROLL_N | export SGEMM_UNROLL_N | ||||
| export DGEMM_UNROLL_M | export DGEMM_UNROLL_M | ||||
| @@ -133,10 +133,10 @@ dll : ../$(LIBDLLNAME) | |||||
| -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) | ||||
| $(LIBPREFIX).def : $(GENSYM) | $(LIBPREFIX).def : $(GENSYM) | ||||
| ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| libgoto_hpl.def : $(GENSYM) | libgoto_hpl.def : $(GENSYM) | ||||
| ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| ifeq ($(FIXED_LIBNAME),1) | ifeq ($(FIXED_LIBNAME),1) | ||||
| @@ -301,23 +301,23 @@ static : ../$(LIBNAME) | |||||
| rm -f goto.$(SUFFIX) | rm -f goto.$(SUFFIX) | ||||
| osx.def : $(GENSYM) ../Makefile.system ../getarch.c | osx.def : $(GENSYM) ../Makefile.system ../getarch.c | ||||
| ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| aix.def : $(GENSYM) ../Makefile.system ../getarch.c | aix.def : $(GENSYM) ../Makefile.system ../getarch.c | ||||
| ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c | objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c | ||||
| ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| objconv.def : $(GENSYM) ../Makefile.system ../getarch.c | objconv.def : $(GENSYM) ../Makefile.system ../getarch.c | ||||
| ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| ./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) | |||||
| test : linktest.c | test : linktest.c | ||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | ||||
| rm -f linktest | rm -f linktest | ||||
| linktest.c : $(GENSYM) ../Makefile.system ../getarch.c | linktest.c : $(GENSYM) ../Makefile.system ../getarch.c | ||||
| ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||||
| ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_HFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c | |||||
| clean :: | clean :: | ||||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | @rm -f *.def *.dylib __.SYMDEF* *.renamed | ||||
| @@ -3816,13 +3816,20 @@ shift | |||||
| p16=$9 | p16=$9 | ||||
| shift | shift | ||||
| p17=$9 | p17=$9 | ||||
| shift | |||||
| p18=$9 | |||||
| if [ $p13 -eq 1 ]; then | if [ $p13 -eq 1 ]; then | ||||
| blasobjs="$blasobjs $bfblasobjs $hfblasobjs" | |||||
| cblasobjs="$cblasobjs $bfcblasobjs $hfcblasobjs" | |||||
| blasobjs="$blasobjs $bfblasobjs" | |||||
| cblasobjs="$cblasobjs $bfcblasobjs" | |||||
| fi | fi | ||||
| if [ $p14 -eq 1 ]; then | if [ $p14 -eq 1 ]; then | ||||
| blasobjs="$blasobjs $hfblasobjs" | |||||
| cblasobjs="$cblasobjs $hfcblasobjs" | |||||
| fi | |||||
| if [ $p15 -eq 1 ]; then | |||||
| blasobjs="$blasobjs $blasobjss" | blasobjs="$blasobjs $blasobjss" | ||||
| cblasobjs="$cblasobjs $cblasobjss" | cblasobjs="$cblasobjs $cblasobjss" | ||||
| lapackobjs="$lapackobjs $lapackobjss" | lapackobjs="$lapackobjs $lapackobjss" | ||||
| @@ -3835,11 +3842,11 @@ if [ $p14 -eq 1 ]; then | |||||
| lapackeobjs="$lapackeobjs $lapackeobjss" | lapackeobjs="$lapackeobjs $lapackeobjss" | ||||
| fi | fi | ||||
| if [ $p15 -eq 1 ]; then | |||||
| if [ $p16 -eq 1 ]; then | |||||
| blasobjs="$blasobjs $blasobjsd" | blasobjs="$blasobjs $blasobjsd" | ||||
| cblasobjs="$cblasobjs $cblasobjsd" | cblasobjs="$cblasobjs $cblasobjsd" | ||||
| lapackobjs="$lapackobjs $lapackobjsd" | lapackobjs="$lapackobjs $lapackobjsd" | ||||
| if [ $p14 -eq 0 ]; then | |||||
| if [ $p15 -eq 0 ]; then | |||||
| lapackobjs2="$lapackobjs2 $lapackobjs2ds" | lapackobjs2="$lapackobjs2 $lapackobjs2ds" | ||||
| fi | fi | ||||
| lapackobjs2="$lapackobjs2 $lapackobjs2d $lapackobjs2dz" | lapackobjs2="$lapackobjs2 $lapackobjs2d $lapackobjs2dz" | ||||
| @@ -3849,14 +3856,14 @@ if [ $p15 -eq 1 ]; then | |||||
| lapackeobjs="$lapackeobjs $lapackeobjsd" | lapackeobjs="$lapackeobjs $lapackeobjsd" | ||||
| fi | fi | ||||
| if [ $p16 -eq 1 ]; then | |||||
| if [ $p17 -eq 1 ]; then | |||||
| blasobjs="$blasobjs $blasobjsc" | blasobjs="$blasobjs $blasobjsc" | ||||
| cblasobjs="$cblasobjs $cblasobjsc" | cblasobjs="$cblasobjs $cblasobjsc" | ||||
| gemm3mobjs="$gemm3mobjs $gemm3mobjsc" | gemm3mobjs="$gemm3mobjs $gemm3mobjsc" | ||||
| cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsc" | cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsc" | ||||
| lapackobjs="$lapackobjs $lapackobjsc" | lapackobjs="$lapackobjs $lapackobjsc" | ||||
| lapackobjs2="$lapackobjs2 $lapackobjs2c $lapackobjs2zc" | lapackobjs2="$lapackobjs2 $lapackobjs2c $lapackobjs2zc" | ||||
| if [ $p14 -eq 0 ]; then | |||||
| if [ $p15 -eq 0 ]; then | |||||
| lapackobjs2="$lapackobjs2 $lapackobjs2sc" | lapackobjs2="$lapackobjs2 $lapackobjs2sc" | ||||
| fi | fi | ||||
| lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsc" | lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsc" | ||||
| @@ -3865,17 +3872,17 @@ if [ $p16 -eq 1 ]; then | |||||
| lapackeobjs="$lapackeobjs $lapackeobjsc" | lapackeobjs="$lapackeobjs $lapackeobjsc" | ||||
| fi | fi | ||||
| if [ $p17 -eq 1 ]; then | |||||
| if [ $p18 -eq 1 ]; then | |||||
| blasobjs="$blasobjs $blasobjsz" | blasobjs="$blasobjs $blasobjsz" | ||||
| cblasobjs="$cblasobjs $cblasobjsz" | cblasobjs="$cblasobjs $cblasobjsz" | ||||
| gemm3mobjs="$gemm3mobjs $gemm3mobjsz" | gemm3mobjs="$gemm3mobjs $gemm3mobjsz" | ||||
| cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsz" | cblasgemm3mobjs="$cblasgemm3mobjs $cblasgemm3mobjsz" | ||||
| lapackobjs="$lapackobjs $lapackobjsz" | lapackobjs="$lapackobjs $lapackobjsz" | ||||
| lapackobjs2="$lapackobjs2 $lapackobjs2z" | lapackobjs2="$lapackobjs2 $lapackobjs2z" | ||||
| if [ $p16 -eq 0 ]; then | |||||
| if [ $p17 -eq 0 ]; then | |||||
| lapackobjs2="$lapackobjs2 $lapackobjs2zc" | lapackobjs2="$lapackobjs2 $lapackobjs2zc" | ||||
| fi | fi | ||||
| if [ $p15 -eq 0 ]; then | |||||
| if [ $p16 -eq 0 ]; then | |||||
| lapackobjs2="$lapackobjs2 $lapackobjs2dz" | lapackobjs2="$lapackobjs2 $lapackobjs2dz" | ||||
| fi | fi | ||||
| lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsz" | lapack_deprecated_objs="$lapack_deprecated_objs $lapack_deprecated_objsz" | ||||
| @@ -3774,10 +3774,14 @@ use File::Basename; | |||||
| my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | ||||
| if ($ARGV[12] == 1) { | if ($ARGV[12] == 1) { | ||||
| @blasobjs = (@blasobjs, @bfblasobjs, @hfblasobjs); | |||||
| @cblasobjs = (@cblasobjs, @bfcblasobjs, @hfcblasobjs); | |||||
| @blasobjs = (@blasobjs, @bfblasobjs); | |||||
| @cblasobjs = (@cblasobjs, @bfcblasobjs); | |||||
| } | } | ||||
| if ($ARGV[13] == 1) { | if ($ARGV[13] == 1) { | ||||
| @blasobjs = (@blasobjs, @hfblasobjs); | |||||
| @cblasobjs = (@cblasobjs, @hfcblasobjs); | |||||
| } | |||||
| if ($ARGV[14] == 1) { | |||||
| @blasobjs = (@blasobjs, @blasobjss); | @blasobjs = (@blasobjs, @blasobjss); | ||||
| @cblasobjs = (@cblasobjs, @cblasobjss); | @cblasobjs = (@cblasobjs, @cblasobjss); | ||||
| @lapackobjs = (@lapackobjs, @lapackobjss); | @lapackobjs = (@lapackobjs, @lapackobjss); | ||||
| @@ -3789,11 +3793,11 @@ if ($ARGV[13] == 1) { | |||||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); | @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); | ||||
| @lapackeobjs = (@lapackeobjs, @lapackeobjss); | @lapackeobjs = (@lapackeobjs, @lapackeobjss); | ||||
| } | } | ||||
| if ($ARGV[14] == 1) { | |||||
| if ($ARGV[15] == 1) { | |||||
| @blasobjs = (@blasobjs, @blasobjsd); | @blasobjs = (@blasobjs, @blasobjsd); | ||||
| @cblasobjs = (@cblasobjs, @cblasobjsd); | @cblasobjs = (@cblasobjs, @cblasobjsd); | ||||
| @lapackobjs = (@lapackobjs, @lapackobjsd); | @lapackobjs = (@lapackobjs, @lapackobjsd); | ||||
| if ($ARGV[13] == 0) { | |||||
| if ($ARGV[14] == 0) { | |||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds); | ||||
| } | } | ||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz); | ||||
| @@ -3802,14 +3806,14 @@ if ($ARGV[14] == 1) { | |||||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); | @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); | ||||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsd); | @lapackeobjs = (@lapackeobjs, @lapackeobjsd); | ||||
| } | } | ||||
| if ($ARGV[15] == 1) { | |||||
| if ($ARGV[16] == 1) { | |||||
| @blasobjs = (@blasobjs, @blasobjsc); | @blasobjs = (@blasobjs, @blasobjsc); | ||||
| @cblasobjs = (@cblasobjs, @cblasobjsc); | @cblasobjs = (@cblasobjs, @cblasobjsc); | ||||
| @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); | @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); | ||||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); | @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc); | ||||
| @lapackobjs = (@lapackobjs, @lapackobjsc); | @lapackobjs = (@lapackobjs, @lapackobjsc); | ||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc); | ||||
| if ($ARGV[13] == 0) { | |||||
| if ($ARGV[14] == 0) { | |||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc); | ||||
| } | } | ||||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); | @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc); | ||||
| @@ -3817,17 +3821,17 @@ if ($ARGV[15] == 1) { | |||||
| @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); | @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); | ||||
| @lapackeobjs = (@lapackeobjs, @lapackeobjsc); | @lapackeobjs = (@lapackeobjs, @lapackeobjsc); | ||||
| } | } | ||||
| if ($ARGV[16] == 1) { | |||||
| if ($ARGV[17] == 1) { | |||||
| @blasobjs = (@blasobjs, @blasobjsz); | @blasobjs = (@blasobjs, @blasobjsz); | ||||
| @cblasobjs = (@cblasobjs, @cblasobjsz); | @cblasobjs = (@cblasobjs, @cblasobjsz); | ||||
| @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); | @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); | ||||
| @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); | @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz); | ||||
| @lapackobjs = (@lapackobjs, @lapackobjsz); | @lapackobjs = (@lapackobjs, @lapackobjsz); | ||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2z); | ||||
| if ($ARGV[15] == 0) { | |||||
| if ($ARGV[16] == 0) { | |||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc); | ||||
| } | } | ||||
| if ($ARGV[14] == 0) { | |||||
| if ($ARGV[15] == 0) { | |||||
| @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); | @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz); | ||||
| } | } | ||||
| @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); | @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz); | ||||
| @@ -56,6 +56,8 @@ | |||||
| #elif defined(BFLOAT16) | #elif defined(BFLOAT16) | ||||
| #define ERROR_NAME "SBGEMM " | #define ERROR_NAME "SBGEMM " | ||||
| #define GEMV BLASFUNC(sbgemv) | #define GEMV BLASFUNC(sbgemv) | ||||
| #elif defined(HFLOAT16) | |||||
| #define ERROR_NAME "SHGEMM " | |||||
| #else | #else | ||||
| #define ERROR_NAME "SGEMM " | #define ERROR_NAME "SGEMM " | ||||
| #define GEMV BLASFUNC(sgemv) | #define GEMV BLASFUNC(sgemv) | ||||
| @@ -111,7 +113,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B | |||||
| #endif | #endif | ||||
| }; | }; | ||||
| #if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) | |||||
| #if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) &&!defined(HFLOAT16) | |||||
| #define USE_SMALL_MATRIX_OPT 1 | #define USE_SMALL_MATRIX_OPT 1 | ||||
| #else | #else | ||||
| #define USE_SMALL_MATRIX_OPT 0 | #define USE_SMALL_MATRIX_OPT 0 | ||||
| @@ -219,11 +221,11 @@ static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { | |||||
| static inline int get_gemm_optimal_nthreads(double MNK) { | static inline int get_gemm_optimal_nthreads(double MNK) { | ||||
| int ncpu = num_cpu_avail(3); | int ncpu = num_cpu_avail(3); | ||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | ||||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||||
| return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | ||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | ||||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | ||||
| } | } | ||||
| @@ -417,7 +419,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) | |||||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | ||||
| #if defined(DYNAMIC_ARCH) | #if defined(DYNAMIC_ARCH) | ||||
| if (support_avx512() ) | if (support_avx512() ) | ||||
| @@ -577,7 +579,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | ||||
| #endif | #endif | ||||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) | |||||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(HFLOAT16) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) | |||||
| #if defined(ARCH_ARM64) | #if defined(ARCH_ARM64) | ||||
| // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c} | // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c} | ||||
| // perform poorly in certain circumstances. We use the following boolean | // perform poorly in certain circumstances. We use the following boolean | ||||
| @@ -133,14 +133,14 @@ ifeq ($(BUILD_HFLOAT16), 1) | |||||
| ifndef SHGEMMKERNEL | ifndef SHGEMMKERNEL | ||||
| SHGEMM_BETA = ../generic/gemm_beta.c | SHGEMM_BETA = ../generic/gemm_beta.c | ||||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | ||||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| SHGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | ||||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | ||||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| SHGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | endif | ||||
| SHKERNELOBJS += \ | SHKERNELOBJS += \ | ||||
| @@ -726,7 +726,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||||
| $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | ||||
| $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||||
| #ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||||
| $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | ||||
| $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| @@ -734,7 +734,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||||
| $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) | $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) | ||||
| $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| endif | |||||
| #endif | |||||
| endif | endif | ||||
| $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | ||||
| @@ -2957,14 +2957,14 @@ $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||||
| $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) | $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) | ||||
| $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||||
| #ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||||
| $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) | $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) | ||||
| $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | ||||
| $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | $(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | ||||
| endif | |||||
| #endif | |||||
| endif | endif | ||||
| $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) | $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) | ||||
| @@ -246,6 +246,7 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = zgemm_beta_rvv.c | ZGEMM_BETA = zgemm_beta_rvv.c | ||||
| endif | endif | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | |||||
| SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c | SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c | ||||
| SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c | SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c | ||||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c | SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c | ||||
| @@ -253,4 +254,5 @@ SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ifndef SHGEMM_BETA | ifndef SHGEMM_BETA | ||||
| SHGEMM_BETA = gemm_beta_rvv.c | SHGEMM_BETA = gemm_beta_rvv.c | ||||
| endif | |||||
| endif | endif | ||||
| @@ -210,6 +210,7 @@ DOMATCOPY_CN = omatcopy_cn_vector.c | |||||
| SOMATCOPY_CN = omatcopy_cn_vector.c | SOMATCOPY_CN = omatcopy_cn_vector.c | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | |||||
| SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c | SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c | ||||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | ||||
| SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c | SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c | ||||
| @@ -224,6 +225,7 @@ SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifndef SHGEMM_BETA | ifndef SHGEMM_BETA | ||||
| SHGEMM_BETA = gemm_beta_rvv.c | SHGEMM_BETA = gemm_beta_rvv.c | ||||
| endif | endif | ||||
| endif | |||||
| SAXPBYKERNEL = axpby_vector_v2.c | SAXPBYKERNEL = axpby_vector_v2.c | ||||
| DAXPBYKERNEL = axpby_vector_v2.c | DAXPBYKERNEL = axpby_vector_v2.c | ||||
| @@ -106,3 +106,12 @@ DASUMKERNEL = dasum.c | |||||
| SROTKERNEL = srot.c | SROTKERNEL = srot.c | ||||
| DROTKERNEL = drot.c | DROTKERNEL = drot.c | ||||
| ifeq ($(BUILD_BFLOAT16), 1) | |||||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||