Browse Source

Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B

Added HFLOAT16 support for RISCV64
Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B based on HFLOAT16
The instruction sets used are ZVFH and ZFH, which need to be supported by RVV1.0

Related to issue #5279
Co-authored-by Linjin Li <linjin_li@163.com>
pull/5290/head
gkdddd 4 months ago
parent
commit
670ec6f757
46 changed files with 39881 additions and 611 deletions
  1. +3
    -0
      CMakeLists.txt
  2. +2
    -2
      Makefile.prebuild
  3. +4
    -4
      Makefile.riscv64
  4. +2
    -0
      Makefile.rule
  5. +4
    -0
      Makefile.system
  6. +2
    -0
      benchmark/gemm.c
  7. +1
    -1
      cblas.h
  8. +6
    -3
      common.h
  9. +6
    -0
      driver/level3/CMakeLists.txt
  10. +55
    -0
      driver/level3/Makefile
  11. +1
    -1
      driver/others/Makefile
  12. +22
    -0
      driver/others/parameter.c
  13. +3
    -0
      exports/Makefile
  14. +4
    -2
      exports/gensymbol
  15. +4
    -3
      exports/gensymbol.pl
  16. +2
    -0
      getarch_2nd.c
  17. +58
    -0
      install/generate.py
  18. BIN
      install/generated_test
  19. +31
    -0
      install/generated_test.c
  20. +457
    -0
      install/include/cblas.h
  21. +811
    -0
      install/include/f77blas.h
  22. +23341
    -0
      install/include/lapack.h
  23. +12888
    -0
      install/include/lapacke.h
  24. +159
    -0
      install/include/lapacke_config.h
  25. +17
    -0
      install/include/lapacke_mangling.h
  26. +612
    -0
      install/include/lapacke_utils.h
  27. +136
    -0
      install/include/openblas_config.h
  28. +4
    -0
      install/lib/cmake/openblas/OpenBLASConfig.cmake
  29. +9
    -0
      install/lib/cmake/openblas/OpenBLASConfigVersion.cmake
  30. +16
    -0
      install/lib/pkgconfig/openblas.pc
  31. BIN
      install/test_shgemm
  32. +45
    -0
      install/test_shgemm.c
  33. BIN
      install/zvl_test
  34. +22
    -0
      install/zvl_test.c
  35. +3
    -0
      interface/CMakeLists.txt
  36. +22
    -2
      interface/Makefile
  37. +55
    -0
      kernel/CMakeLists.txt
  38. +164
    -0
      kernel/Makefile.L3
  39. +9
    -0
      kernel/riscv64/KERNEL.RISCV64_ZVL128B
  40. +16
    -0
      kernel/riscv64/KERNEL.RISCV64_ZVL256B
  41. +810
    -559
      kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
  42. +33
    -33
      kernel/riscv64/shgemm_kernel_8x8_zvl128b.c
  43. +37
    -0
      kernel/setparam-ref.c
  44. +1
    -0
      lapack/CMakeLists.txt
  45. +3
    -1
      openblas_config_template.h
  46. +1
    -0
      param.h

+ 3
- 0
CMakeLists.txt View File

@@ -152,6 +152,9 @@ endif ()
if (NOT DEFINED BUILD_BFLOAT16) if (NOT DEFINED BUILD_BFLOAT16)
set (BUILD_BFLOAT16 false) set (BUILD_BFLOAT16 false)
endif () endif ()
if (NOT DEFINED BUILD_HFLOAT16)
set (BUILD_HFLOAT16 false)
endif ()
# set which float types we want to build for # set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all # if none are defined, build for all


+ 2
- 2
Makefile.prebuild View File

@@ -64,11 +64,11 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
endif endif


ifeq ($(TARGET), RISCV64_ZVL256B) ifeq ($(TARGET), RISCV64_ZVL256B)
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
endif endif


ifeq ($(TARGET), RISCV64_ZVL128B) ifeq ($(TARGET), RISCV64_ZVL128B)
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
endif endif


ifeq ($(TARGET), RISCV64_GENERIC) ifeq ($(TARGET), RISCV64_GENERIC)


+ 4
- 4
Makefile.riscv64 View File

@@ -7,12 +7,12 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
endif endif
ifeq ($(CORE), RISCV64_ZVL256B) ifeq ($(CORE), RISCV64_ZVL256B)
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
CCOMMON_OPT += -march=rv64imafdcv_zvl256b_zvfh_zfh -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
endif endif
ifeq ($(CORE), RISCV64_ZVL128B) ifeq ($(CORE), RISCV64_ZVL128B)
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
CCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
endif endif
ifeq ($(CORE), RISCV64_GENERIC) ifeq ($(CORE), RISCV64_GENERIC)
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d


+ 2
- 0
Makefile.rule View File

@@ -308,6 +308,8 @@ COMMON_PROF = -pg
# If you want to enable the experimental BFLOAT16 support # If you want to enable the experimental BFLOAT16 support
# BUILD_BFLOAT16 = 1 # BUILD_BFLOAT16 = 1


# If you want to enable the experimental HFLOAT16 support
BUILD_HFLOAT16 = 1


# Set the thread number threshold beyond which the job array for the threaded level3 BLAS # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
# will be allocated on the heap rather than the stack. (This array alone requires # will be allocated on the heap rather than the stack. (This array alone requires


+ 4
- 0
Makefile.system View File

@@ -280,6 +280,7 @@ GEMM_GEMV_FORWARD_BF16 = 1
endif endif
ifeq ($(ARCH), riscv) ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1 GEMM_GEMV_FORWARD = 1
BUILD_HFLOAT16 = 1
endif endif
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
GEMM_GEMV_FORWARD = 1 GEMM_GEMV_FORWARD = 1
@@ -1547,6 +1548,9 @@ endif
ifeq ($(BUILD_BFLOAT16), 1) ifeq ($(BUILD_BFLOAT16), 1)
CCOMMON_OPT += -DBUILD_BFLOAT16 CCOMMON_OPT += -DBUILD_BFLOAT16
endif endif
ifeq ($(BUILD_HFLOAT16), 1)
CCOMMON_OPT += -DBUILD_HFLOAT16
endif
ifeq ($(BUILD_SINGLE), 1) ifeq ($(BUILD_SINGLE), 1)
CCOMMON_OPT += -DBUILD_SINGLE=1 CCOMMON_OPT += -DBUILD_SINGLE=1
endif endif


+ 2
- 0
benchmark/gemm.c View File

@@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM BLASFUNC(dgemm) #define GEMM BLASFUNC(dgemm)
#elif defined(HALF) #elif defined(HALF)
#define GEMM BLASFUNC(sbgemm) #define GEMM BLASFUNC(sbgemm)
#elif defined(HFLOAT16)
#define GEMM BLASFUNC(shgemm)
#else #else
#define GEMM BLASFUNC(sgemm) #define GEMM BLASFUNC(sgemm)
#endif #endif


+ 1
- 1
cblas.h View File

@@ -446,7 +446,7 @@ void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum C
void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size); OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);


/*** FLOAT16 extensions */
/*** FLOAT16 extensions ***/
void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);




+ 6
- 3
common.h View File

@@ -266,9 +266,12 @@ typedef uint16_t bfloat16;
#define BFLOAT16CONVERSION 1 #define BFLOAT16CONVERSION 1
#endif #endif


#ifndef hfloat16
#include <stdint.h>
typedef uint16_t hfloat16;
#ifdef BUILD_HFLOAT16
#ifndef hfloat16
typedef _Float16 hfloat16;
#endif
#else
typedef uint16_t hfloat16;
#endif #endif


#ifdef USE64BITINT #ifdef USE64BITINT


+ 6
- 0
driver/level3/CMakeLists.txt View File

@@ -18,6 +18,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
endif () endif ()
endif () endif ()
if (BUILD_HFLOAT16)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "HFLOAT16")
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "HFLOAT16")
endif ()
endif ()
endforeach () endforeach ()


if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)


+ 55
- 0
driver/level3/Makefile View File

@@ -23,6 +23,10 @@ ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX)
endif endif


ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX)
endif

SBLASOBJS += \ SBLASOBJS += \
sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \
strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \
@@ -210,6 +214,9 @@ ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1)
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX)
endif endif
ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX)
endif
SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX)
@@ -355,6 +362,18 @@ sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)


shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)

shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)

shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)

sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)


@@ -562,6 +581,18 @@ sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)


shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)

shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)

shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)

sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)


@@ -2747,6 +2778,18 @@ sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)


shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)

shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)

shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)

sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)


@@ -2970,6 +3013,18 @@ sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)


shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)

shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)

shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)

sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)




+ 1
- 1
driver/others/Makefile View File

@@ -218,7 +218,7 @@ mulx.$(SUFFIX) : $(ARCH)/mulx.c
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F)


detect_riscv64.$(SUFFIX): detect_riscv64.c detect_riscv64.$(SUFFIX): detect_riscv64.c
$(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F)
$(CC) $(CFLAGS) -c -march=rv64imafdcv_zvfh_zfh $< -o $(@F)


xerbla.$(PSUFFIX) : xerbla.c xerbla.$(PSUFFIX) : xerbla.c
$(CC) $(PFLAGS) -c $< -o $(@F) $(CC) $(PFLAGS) -c $< -o $(@F)


+ 22
- 0
driver/others/parameter.c View File

@@ -67,6 +67,11 @@ BLASLONG sbgemm_p = DEFAULT_GEMM_P;
#else #else
BLASLONG sbgemm_p = SBGEMM_P; BLASLONG sbgemm_p = SBGEMM_P;
#endif #endif
#if SHGEMM_P == shgemm_p
BLASLONG shgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG shgemm_p = SHGEMM_P;
#endif
#if SGEMM_P == sgemm_p #if SGEMM_P == sgemm_p
BLASLONG sgemm_p = DEFAULT_GEMM_P; BLASLONG sgemm_p = DEFAULT_GEMM_P;
#else #else
@@ -93,6 +98,11 @@ BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
#else #else
BLASLONG sbgemm_q = SBGEMM_Q; BLASLONG sbgemm_q = SBGEMM_Q;
#endif #endif
#if SHGEMM_Q == shgemm_q
BLASLONG shgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG shgemm_q = SHGEMM_Q;
#endif
#if SGEMM_Q == sgemm_q #if SGEMM_Q == sgemm_q
BLASLONG sgemm_q = DEFAULT_GEMM_Q; BLASLONG sgemm_q = DEFAULT_GEMM_Q;
#else #else
@@ -119,6 +129,11 @@ BLASLONG sbgemm_r = DEFAULT_GEMM_R;
#else #else
BLASLONG sbgemm_r = SBGEMM_R; BLASLONG sbgemm_r = SBGEMM_R;
#endif #endif
#if SHGEMM_R == shgemm_r
BLASLONG shgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG shgemm_r = SHGEMM_R;
#endif
#if SGEMM_R == sgemm_r #if SGEMM_R == sgemm_r
BLASLONG sgemm_r = DEFAULT_GEMM_R; BLASLONG sgemm_r = DEFAULT_GEMM_R;
#else #else
@@ -526,6 +541,9 @@ void blas_set_parameter(void){


#ifdef BUILD_BFLOAT16 #ifdef BUILD_BFLOAT16
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
#ifdef BUILD_HFLOAT16
shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
#endif #endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
@@ -619,6 +637,7 @@ void blas_set_parameter(void){
size = BITMASK(cpuid3, 16, 0xff); size = BITMASK(cpuid3, 16, 0xff);


sbgemm_p = 192 * (size + 1); sbgemm_p = 192 * (size + 1);
shgemm_p = 192 * (size + 1);
sgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1);
dgemm_p = 96 * (size + 1); dgemm_p = 96 * (size + 1);
cgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1);
@@ -634,6 +653,9 @@ void blas_set_parameter(void){


#ifdef BUILD_BFLOAT16 #ifdef BUILD_BFLOAT16
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
#ifdef BUILD_HFLOAT16
shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
#endif #endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;


+ 3
- 0
exports/Makefile View File

@@ -39,6 +39,9 @@ endif
ifndef BUILD_BFLOAT16 ifndef BUILD_BFLOAT16
BUILD_BFLOAT16 = 0 BUILD_BFLOAT16 = 0
endif endif
ifndef BUILD_HFLOAT16
BUILD_HFLOAT16 = 0
endif
ifndef BUILD_SINGLE ifndef BUILD_SINGLE
BUILD_SINGLE = 0 BUILD_SINGLE = 0
endif endif


+ 4
- 2
exports/gensymbol View File

@@ -52,6 +52,7 @@ blasobjsz="


blasobjs="lsame xerbla" blasobjs="lsame xerbla"
bfblasobjs="sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" bfblasobjs="sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
hfblasobjs="shgemm"
cblasobjsc=" cblasobjsc="
cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k
@@ -100,6 +101,7 @@ cblasobjsz="
cblasobjs="cblas_xerbla" cblasobjs="cblas_xerbla"


bfcblasobjs="cblas_sbgemm cblas_sbgemv cblas_sbdot cblas_sbstobf16 cblas_sbdtobf16 cblas_sbf16tos cblas_dbf16tod cblas_sbgemm_batch" bfcblasobjs="cblas_sbgemm cblas_sbgemv cblas_sbdot cblas_sbstobf16 cblas_sbdtobf16 cblas_sbf16tos cblas_dbf16tod cblas_sbgemm_batch"
hfcblasobjs="cblas_shgemm"


exblasobjs=" exblasobjs="
qamax qamin qasum qaxpy qcabs1 qcopy qdot qgbmv qgemm qamax qamin qasum qaxpy qcabs1 qcopy qdot qgbmv qgemm
@@ -3816,8 +3818,8 @@ shift
p17=$9 p17=$9


if [ $p13 -eq 1 ]; then if [ $p13 -eq 1 ]; then
blasobjs="$blasobjs $bfblasobjs"
cblasobjs="$cblasobjs $bfcblasobjs"
blasobjs="$blasobjs $bfblasobjs $hfblasobjs"
cblasobjs="$cblasobjs $bfcblasobjs $hfcblasobjs"
fi fi


if [ $p14 -eq 1 ]; then if [ $p14 -eq 1 ]; then


+ 4
- 3
exports/gensymbol.pl View File

@@ -52,6 +52,7 @@


@blasobjs = (lsame, xerbla); @blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@hfblasobjs = (shgemm);
@cblasobjsc = ( @cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@@ -97,7 +98,7 @@
@cblasobjs = ( cblas_xerbla ); @cblasobjs = ( cblas_xerbla );


@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); @bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch);
@hfcblasobjs = (cblas_shgemm);
@exblasobjs = ( @exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
qgemv,qger,qmax,qmin, qgemv,qger,qmax,qmin,
@@ -3773,8 +3774,8 @@ use File::Basename;
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");


if ($ARGV[12] == 1) { if ($ARGV[12] == 1) {
@blasobjs = (@blasobjs, @bfblasobjs);
@cblasobjs = (@cblasobjs, @bfcblasobjs);
@blasobjs = (@blasobjs, @bfblasobjs, @hfblasobjs);
@cblasobjs = (@cblasobjs, @bfcblasobjs, @hfcblasobjs);
} }
if ($ARGV[13] == 1) { if ($ARGV[13] == 1) {
@blasobjs = (@blasobjs, @blasobjss); @blasobjs = (@blasobjs, @blasobjss);


+ 2
- 0
getarch_2nd.c View File

@@ -19,6 +19,8 @@ int main(int argc, char **argv) {
if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M);
printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N);
printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M);
printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N);
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);


+ 58
- 0
install/generate.py View File

@@ -0,0 +1,58 @@
import numpy as np
import torch
# 设置矩阵尺寸
M, K, N = 31, 31, 31 # 可修改为更大规模

# 生成随机输入矩阵,类型为float16
A = np.random.randint(0, 11, size=(M, K)).astype(np.float16)
B = np.random.randint(0, 11, size=(K, N)).astype(np.float16)
A_torch = torch.tensor(A, dtype=torch.float16, device='cuda')
B_torch = torch.tensor(B, dtype=torch.float16, device='cuda')
C_torch = torch.matmul(A_torch, B_torch)
C_ref = C_torch.cpu().numpy().astype(np.float32)

def format_array_c(name, array, c_type="hfloat16"):
flat = array.flatten()
elements = ", ".join(f"{x:.5f}" for x in flat)
return f"{c_type} {name}[{len(flat)}] = {{ {elements} }};\n"

def format_array_c_float(name, array):
flat = array.flatten()
elements = ", ".join(f"{x:.5f}" for x in flat)
return f"float {name}[{len(flat)}] = {{ {elements} }};\n"

# 写入C文件
with open("generated_test.c", "w") as f:
f.write('#include <stdio.h>\n')
f.write('#include <stdlib.h>\n')
f.write('#include <string.h>\n')
f.write('#include <cblas.h>\n\n')

f.write(f"const int M = {M}, K = {K}, N = {N};\n")
f.write("const float alpha = 1.0f, beta = 0.0f;\n\n")

f.write(format_array_c("A", A))
f.write(format_array_c("B", B))
f.write(f"float C[{M*N}] = {{ 0 }};\n\n")

f.write("int main() {\n")
f.write(" cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,\n")
f.write(" M, N, K,\n")
f.write(" alpha,\n")
f.write(" A, K,\n")
f.write(" B, N,\n")
f.write(" beta,\n")
f.write(" C, N);\n\n")

f.write(' printf("Result C = A * B:\\n");\n')
f.write(" for (int i = 0; i < M * N; i++) {\n")
f.write(" printf(\"%.5f \", C[i]);\n")
f.write(" if ((i + 1) % N == 0) printf(\"\\n\");\n")
f.write(" }\n")
f.write(" return 0;\n")
f.write("}\n\n")

f.write("// Reference result computed in Python:\n")
c_ref_flat = ", ".join(f"{x:.5f}" for x in C_ref.flatten())
f.write(f"// C_ref = {{ {c_ref_flat} }}\n")


BIN
install/generated_test View File


+ 31
- 0
install/generated_test.c
File diff suppressed because it is too large
View File


+ 457
- 0
install/include/cblas.h View File

@@ -0,0 +1,457 @@
#ifndef CBLAS_H
#define CBLAS_H

#include <stddef.h>
#include "openblas_config.h"

#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */

/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
int openblas_set_num_threads_local(int num_threads);

/*Get the number of threads on runtime.*/
int openblas_get_num_threads(void);

/*Get the number of physical processors (cores).*/
int openblas_get_num_procs(void);

/*Get the build configure on runtime.*/
char* openblas_get_config(void);

/*Get the CPU corename on runtime.*/
char* openblas_get_corename(void);

/*Set the threading backend to a custom callback.*/
typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
void openblas_set_threads_callback_function(openblas_threads_callback callback);

#ifdef OPENBLAS_OS_LINUX
/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
/* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
#endif

/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2


/*
* Since all of GotoBlas was written without const,
* we disable it at build time.
*/
#ifndef OPENBLAS_CONST
# define OPENBLAS_CONST const
#endif


#define CBLAS_INDEX size_t

typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
typedef CBLAS_ORDER CBLAS_LAYOUT;
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_ddot(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy);

openblas_complex_float cblas_cdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);
openblas_complex_float cblas_cdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);
openblas_complex_double cblas_zdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);
openblas_complex_double cblas_zdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);

void cblas_cdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);
void cblas_cdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);
void cblas_zdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);
void cblas_zdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);

float cblas_sasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
double cblas_dznrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);

CBLAS_INDEX cblas_isamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

void cblas_sswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_dswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_cswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);

void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_crotg(void *a, void *b, float *c, void *s);
void cblas_zrotg(void *a, void *b, double *c, void *s);


void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);

void cblas_srotmg(float *d1, float *d2, float *b1, OPENBLAS_CONST float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, OPENBLAS_CONST double b2, double *P);

void cblas_sscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX);
void cblas_dscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX);
void cblas_cscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX);
void cblas_zscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX);
void cblas_csscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, void *X, OPENBLAS_CONST blasint incX);
void cblas_zdscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, void *X, OPENBLAS_CONST blasint incX);

void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
void cblas_dgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy);
void cblas_cgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);
void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);

void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda);
void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
void cblas_cgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
void cblas_cgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
void cblas_zgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
void cblas_zgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);

void cblas_strsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
void cblas_dtrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
void cblas_ctrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
void cblas_ztrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

void cblas_strmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
void cblas_dtrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
void cblas_ctrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
void cblas_ztrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

void cblas_ssyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda);
void cblas_dsyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda);
void cblas_cher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda);
void cblas_zher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda);

void cblas_ssyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo,OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X,
OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda);
void cblas_dsyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X,
OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
void cblas_cher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX,
OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
void cblas_zher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX,
OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);

void cblas_sgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
void cblas_dgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
void cblas_cgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
void cblas_zgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);

void cblas_ssbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);


void cblas_stbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
void cblas_dtbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
void cblas_ctbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
void cblas_ztbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

void cblas_stbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
void cblas_dtbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
void cblas_ctbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
void cblas_ztbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);

void cblas_stpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
void cblas_dtpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
void cblas_ctpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
void cblas_ztpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);

void cblas_stpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
void cblas_dtpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
void cblas_ctpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
void cblas_ztpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);

void cblas_ssymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
void cblas_dsymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
void cblas_chemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A,
OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
void cblas_zhemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A,
OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);


void cblas_sspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *Ap,
OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
void cblas_dspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *Ap,
OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);

void cblas_sspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *Ap);
void cblas_dspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *Ap);

void cblas_chpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A);
void cblas_zhpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X,OPENBLAS_CONST blasint incX, void *A);

void cblas_sspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A);
void cblas_dspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A);
void cblas_chpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap);
void cblas_zhpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap);

void cblas_chbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
void cblas_zhbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);

void cblas_chpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
void cblas_zhpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);

void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_csymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_ssyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_csyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_ssyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_csyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_strmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
void cblas_dtrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
void cblas_ctrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
void cblas_ztrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);

void cblas_strsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
void cblas_dtrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
void cblas_ctrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
void cblas_ztrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);

void cblas_chemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zhemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_cherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);

void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);

/*** BLAS extensions ***/

void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);

void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy);

void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);

void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);

void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a,
OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb);
void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a,
OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a,
OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb);
void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a,
OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);

void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);

void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
double *c, OPENBLAS_CONST blasint cldc);
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc);

void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

/*** BFLOAT16 and INT8 extensions ***/
/* convert float array to BFLOAT16 array by rounding */
void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
/* convert double array to BFLOAT16 array by rounding */
void cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
/* convert BFLOAT16 array to float array */
void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout);
/* convert BFLOAT16 array to double array */
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
/* dot production of BFLOAT16 input arrays, and output as float */
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);

void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);

/*** FLOAT16 extensions ***/
void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif

+ 811
- 0
install/include/f77blas.h View File

@@ -0,0 +1,811 @@
#ifndef OPENBLAS_F77BLAS_H
#define OPENBLAS_F77BLAS_H
#include "openblas_config.h"
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#ifndef ASSEMBLER

#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */

int BLASFUNC(xerbla)(char *, blasint *info, blasint);

void openblas_set_num_threads_(int *);

/*Set the threading backend to a custom callback.*/
typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
extern openblas_threads_callback openblas_threads_callback_;

FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);

double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *);
double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *);
xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *);

float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *);
void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *);

#ifdef RETURN_BY_STRUCT
typedef struct {
float r, i;
} myccomplex_t;

typedef struct {
double r, i;
} myzcomplex_t;

typedef struct {
xdouble r, i;
} myxcomplex_t;

myccomplex_t BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *);
myccomplex_t BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *);
myzcomplex_t BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *);
myzcomplex_t BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *);
myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);

#elif defined RETURN_BY_STACK
void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *);
void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
#else
openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *);
openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *);
openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *);
openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *);
openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
#endif

void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
void BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
void BLASFUNC(xaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(caxpyc)(blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(zaxpyc)(blasint *, double *, double *, blasint *, double *, blasint *);
void BLASFUNC(xaxpyc)(blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(scopy) (blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(dcopy) (blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(qcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(ccopy) (blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zcopy) (blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(sswap) (blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(dswap) (blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(cswap) (blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zswap) (blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *);

FLOATRET BLASFUNC(sasum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scasum)(blasint *, float *, blasint *);
double BLASFUNC(dasum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);

FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);

blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(icamax)(blasint *, float *, blasint *);
blasint BLASFUNC(izamax)(blasint *, double *, blasint *);
blasint BLASFUNC(ixamax)(blasint *, xdouble *, blasint *);

blasint BLASFUNC(ismax) (blasint *, float *, blasint *);
blasint BLASFUNC(idmax) (blasint *, double *, blasint *);
blasint BLASFUNC(iqmax) (blasint *, xdouble *, blasint *);
blasint BLASFUNC(icmax) (blasint *, float *, blasint *);
blasint BLASFUNC(izmax) (blasint *, double *, blasint *);
blasint BLASFUNC(ixmax) (blasint *, xdouble *, blasint *);

blasint BLASFUNC(isamin)(blasint *, float *, blasint *);
blasint BLASFUNC(idamin)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamin)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(icamin)(blasint *, float *, blasint *);
blasint BLASFUNC(izamin)(blasint *, double *, blasint *);
blasint BLASFUNC(ixamin)(blasint *, xdouble *, blasint *);

blasint BLASFUNC(ismin)(blasint *, float *, blasint *);
blasint BLASFUNC(idmin)(blasint *, double *, blasint *);
blasint BLASFUNC(iqmin)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(icmin)(blasint *, float *, blasint *);
blasint BLASFUNC(izmin)(blasint *, double *, blasint *);
blasint BLASFUNC(ixmin)(blasint *, xdouble *, blasint *);

FLOATRET BLASFUNC(samax) (blasint *, float *, blasint *);
double BLASFUNC(damax) (blasint *, double *, blasint *);
xdouble BLASFUNC(qamax) (blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(scamax)(blasint *, float *, blasint *);
double BLASFUNC(dzamax)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxamax)(blasint *, xdouble *, blasint *);

FLOATRET BLASFUNC(samin) (blasint *, float *, blasint *);
double BLASFUNC(damin) (blasint *, double *, blasint *);
xdouble BLASFUNC(qamin) (blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(scamin)(blasint *, float *, blasint *);
double BLASFUNC(dzamin)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxamin)(blasint *, xdouble *, blasint *);

FLOATRET BLASFUNC(smax) (blasint *, float *, blasint *);
double BLASFUNC(dmax) (blasint *, double *, blasint *);
xdouble BLASFUNC(qmax) (blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(scmax) (blasint *, float *, blasint *);
double BLASFUNC(dzmax) (blasint *, double *, blasint *);
xdouble BLASFUNC(qxmax) (blasint *, xdouble *, blasint *);

FLOATRET BLASFUNC(smin) (blasint *, float *, blasint *);
double BLASFUNC(dmin) (blasint *, double *, blasint *);
xdouble BLASFUNC(qmin) (blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(scmin) (blasint *, float *, blasint *);
double BLASFUNC(dzmin) (blasint *, double *, blasint *);
xdouble BLASFUNC(qxmin) (blasint *, xdouble *, blasint *);

void BLASFUNC(sscal) (blasint *, float *, float *, blasint *);
void BLASFUNC(dscal) (blasint *, double *, double *, blasint *);
void BLASFUNC(qscal) (blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(cscal) (blasint *, float *, float *, blasint *);
void BLASFUNC(zscal) (blasint *, double *, double *, blasint *);
void BLASFUNC(xscal) (blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(csscal)(blasint *, float *, float *, blasint *);
void BLASFUNC(zdscal)(blasint *, double *, double *, blasint *);
void BLASFUNC(xqscal)(blasint *, xdouble *, xdouble *, blasint *);

FLOATRET BLASFUNC(snrm2) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scnrm2)(blasint *, float *, blasint *);

double BLASFUNC(dnrm2) (blasint *, double *, blasint *);
xdouble BLASFUNC(qnrm2) (blasint *, xdouble *, blasint *);
double BLASFUNC(dznrm2)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxnrm2)(blasint *, xdouble *, blasint *);

void BLASFUNC(srot) (blasint *, float *, blasint *, float *, blasint *, float *, float *);
void BLASFUNC(drot) (blasint *, double *, blasint *, double *, blasint *, double *, double *);
void BLASFUNC(qrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *);
void BLASFUNC(csrot) (blasint *, float *, blasint *, float *, blasint *, float *, float *);
void BLASFUNC(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *);
void BLASFUNC(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *);

void BLASFUNC(srotg) (float *, float *, float *, float *);
void BLASFUNC(drotg) (double *, double *, double *, double *);
void BLASFUNC(qrotg) (xdouble *, xdouble *, xdouble *, xdouble *);
void BLASFUNC(crotg) (float *, float *, float *, float *);
void BLASFUNC(zrotg) (double *, double *, double *, double *);
void BLASFUNC(xrotg) (xdouble *, xdouble *, xdouble *, xdouble *);

void BLASFUNC(srotmg)(float *, float *, float *, float *, float *);
void BLASFUNC(drotmg)(double *, double *, double *, double *, double *);

void BLASFUNC(srotm) (blasint *, float *, blasint *, float *, blasint *, float *);
void BLASFUNC(drotm) (blasint *, double *, blasint *, double *, blasint *, double *);
void BLASFUNC(qrotm) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *);

/* Level 2 routines */

void BLASFUNC(sger)(blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, blasint *);
void BLASFUNC(dger)(blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, blasint *);
void BLASFUNC(qger)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(cgeru)(blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, blasint *);
void BLASFUNC(cgerc)(blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, blasint *);
void BLASFUNC(zgeru)(blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, blasint *);
void BLASFUNC(zgerc)(blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, blasint *);
void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *,
bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(strsv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);
void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);

void BLASFUNC(strmv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);
void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *,
float *, blasint *);
void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *,
double *, blasint *);
void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
xdouble *, blasint *);

void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float *, float *, blasint *);
void BLASFUNC(dtpsv) (char *, char *, char *, blasint *, double *, double *, blasint *);
void BLASFUNC(qtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(ctpsv) (char *, char *, char *, blasint *, float *, float *, blasint *);
void BLASFUNC(ztpsv) (char *, char *, char *, blasint *, double *, double *, blasint *);
void BLASFUNC(xtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(stpmv) (char *, char *, char *, blasint *, float *, float *, blasint *);
void BLASFUNC(dtpmv) (char *, char *, char *, blasint *, double *, double *, blasint *);
void BLASFUNC(qtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(ctpmv) (char *, char *, char *, blasint *, float *, float *, blasint *);
void BLASFUNC(ztpmv) (char *, char *, char *, blasint *, double *, double *, blasint *);
void BLASFUNC(xtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(stbmv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(dtbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(qtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(ctbmv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(ztbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(stbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(dtbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(qtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(sspmv) (char *, blasint *, float *, float *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dspmv) (char *, blasint *, double *, double *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(cspmv) (char *, blasint *, float *, float *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zspmv) (char *, blasint *, double *, double *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xspmv) (char *, blasint *, xdouble *, xdouble *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(ssyr) (char *, blasint *, float *, float *, blasint *,
float *, blasint *);
void BLASFUNC(dsyr) (char *, blasint *, double *, double *, blasint *,
double *, blasint *);
void BLASFUNC(qsyr) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *);
void BLASFUNC(csyr) (char *, blasint *, float *, float *, blasint *,
float *, blasint *);
void BLASFUNC(zsyr) (char *, blasint *, double *, double *, blasint *,
double *, blasint *);
void BLASFUNC(xsyr) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *);

void BLASFUNC(ssyr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(dsyr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(qsyr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(csyr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zsyr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xsyr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(sspr) (char *, blasint *, float *, float *, blasint *,
float *);
void BLASFUNC(dspr) (char *, blasint *, double *, double *, blasint *,
double *);
void BLASFUNC(qspr) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *);
void BLASFUNC(cspr) (char *, blasint *, float *, float *, blasint *,
float *);
void BLASFUNC(zspr) (char *, blasint *, double *, double *, blasint *,
double *);
void BLASFUNC(xspr) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *);

void BLASFUNC(sspr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *);
void BLASFUNC(dspr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *);
void BLASFUNC(qspr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *);
void BLASFUNC(cspr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *);
void BLASFUNC(zspr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *);
void BLASFUNC(xspr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *);

void BLASFUNC(cher) (char *, blasint *, float *, float *, blasint *,
float *, blasint *);
void BLASFUNC(zher) (char *, blasint *, double *, double *, blasint *,
double *, blasint *);
void BLASFUNC(xher) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *);

void BLASFUNC(chpr) (char *, blasint *, float *, float *, blasint *, float *);
void BLASFUNC(zhpr) (char *, blasint *, double *, double *, blasint *, double *);
void BLASFUNC(xhpr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *);

void BLASFUNC(cher2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zher2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xher2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(chpr2) (char *, blasint *, float *,
float *, blasint *, float *, blasint *, float *);
void BLASFUNC(zhpr2) (char *, blasint *, double *,
double *, blasint *, double *, blasint *, double *);
void BLASFUNC(xhpr2) (char *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *);

void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(chpmv) (char *, blasint *, float *, float *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zhpmv) (char *, blasint *, double *, double *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xhpmv) (char *, blasint *, xdouble *, xdouble *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

int BLASFUNC(snorm)(char *, blasint *, blasint *, float *, blasint *);
int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *);
int BLASFUNC(cnorm)(char *, blasint *, blasint *, float *, blasint *);
int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *);

void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

/* Level 3 routines */

void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
hfloat16 *, blasint *, hfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(cgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(cgemm3m)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);

int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *,
float *, float *, blasint *);
int BLASFUNC(dge2mm)(char *, char *, char *, blasint *, blasint *,
double *, double *, blasint *, double *, blasint *,
double *, double *, blasint *);
int BLASFUNC(cge2mm)(char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *,
float *, float *, blasint *);
int BLASFUNC(zge2mm)(char *, char *, char *, blasint *, blasint *,
double *, double *, blasint *, double *, blasint *,
double *, double *, blasint *);

void BLASFUNC(strsm)(char *, char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *);
void BLASFUNC(dtrsm)(char *, char *, char *, char *, blasint *, blasint *,
double *, double *, blasint *, double *, blasint *);
void BLASFUNC(qtrsm)(char *, char *, char *, char *, blasint *, blasint *,
xdouble *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(ctrsm)(char *, char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *);
void BLASFUNC(ztrsm)(char *, char *, char *, char *, blasint *, blasint *,
double *, double *, blasint *, double *, blasint *);
void BLASFUNC(xtrsm)(char *, char *, char *, char *, blasint *, blasint *,
xdouble *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(strmm)(char *, char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *);
void BLASFUNC(dtrmm)(char *, char *, char *, char *, blasint *, blasint *,
double *, double *, blasint *, double *, blasint *);
void BLASFUNC(qtrmm)(char *, char *, char *, char *, blasint *, blasint *,
xdouble *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(ctrmm)(char *, char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *);
void BLASFUNC(ztrmm)(char *, char *, char *, char *, blasint *, blasint *,
double *, double *, blasint *, double *, blasint *);
void BLASFUNC(xtrmm)(char *, char *, char *, char *, blasint *, blasint *,
xdouble *, xdouble *, blasint *, xdouble *, blasint *);

void BLASFUNC(ssymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(qsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(csymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(csymm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zsymm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xsymm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(ssyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, float *, blasint *);
void BLASFUNC(dsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, double *, blasint *);
void BLASFUNC(qsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, xdouble *, blasint *);
void BLASFUNC(csyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, float *, blasint *);
void BLASFUNC(zsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, double *, blasint *);
void BLASFUNC(xsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, xdouble *, blasint *);

void BLASFUNC(ssyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double*, blasint *, double *, double *, blasint *);
void BLASFUNC(qsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble*, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(csyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double*, blasint *, double *, double *, blasint *);
void BLASFUNC(xsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble*, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(chemm)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zhemm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xhemm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(chemm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zhemm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *, double *, double *, blasint *);
void BLASFUNC(xhemm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, xdouble *, blasint *);

void BLASFUNC(cherk)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, float *, blasint *);
void BLASFUNC(zherk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double *, double *, blasint *);
void BLASFUNC(xherk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, xdouble *, blasint *);

void BLASFUNC(cher2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zher2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
double*, blasint *, double *, double *, blasint *);
void BLASFUNC(xher2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble*, blasint *, xdouble *, xdouble *, blasint *);

int BLASFUNC(cher2m)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
int BLASFUNC(zher2m)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *,
double*, blasint *, double *, double *, blasint *);
int BLASFUNC(xher2m)(char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble*, blasint *, xdouble *, xdouble *, blasint *);

int BLASFUNC(sgemt)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *);
int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *);
int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *);
int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
double *, blasint *);

int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, float *, blasint *, float *, blasint *);
int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *,
double *, blasint *, double*, double *, blasint *, double*, blasint *);
int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, float *, blasint *, float *, blasint *);
int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *,
double *, blasint *, double*, double *, blasint *, double*, blasint *);

int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, float *, blasint *, float *, blasint *);
int BLASFUNC(dgems)(char *, char *, blasint *, blasint *, double *,
double *, blasint *, double*, double *, blasint *, double*, blasint *);
int BLASFUNC(cgems)(char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, float *, blasint *, float *, blasint *);
int BLASFUNC(zgems)(char *, char *, blasint *, blasint *, double *,
double *, blasint *, double*, double *, blasint *, double*, blasint *);

int BLASFUNC(sgemc)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *);
int BLASFUNC(dgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *);
int BLASFUNC(qgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
int BLASFUNC(cgemc)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *);
int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *);
int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

/* Lapack routines */

int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *);
int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
int BLASFUNC(cgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *);
int BLASFUNC(zgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
int BLASFUNC(xgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);

int BLASFUNC(sgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *);
int BLASFUNC(dgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
int BLASFUNC(qgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
int BLASFUNC(cgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *);
int BLASFUNC(zgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
int BLASFUNC(xgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);

int BLASFUNC(slaswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *);
int BLASFUNC(dlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *);
int BLASFUNC(qlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *);
int BLASFUNC(claswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *);
int BLASFUNC(zlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *);
int BLASFUNC(xlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *);

int BLASFUNC(sgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(sgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
int BLASFUNC(qgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);

int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotf2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotf2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(spotrf)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotrf)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(clauu2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zlauu2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(slauum)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dlauum)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qlauum)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(clauum)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zlauum)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xlauum)(char *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(strti2)(char *, char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dtrti2)(char *, char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(ctrti2)(char *, char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(ztrti2)(char *, char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *);

int BLASFUNC(strtri)(char *, char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dtrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(ctrtri)(char *, char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);


FLOATRET BLASFUNC(slamch)(char *);
double BLASFUNC(dlamch)(char *);
xdouble BLASFUNC(qlamch)(char *);

FLOATRET BLASFUNC(slamc3)(float *, float *);
double BLASFUNC(dlamc3)(double *, double *);
xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);

/* BLAS extensions */

void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);

void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
void BLASFUNC(comatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(zomatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);

void BLASFUNC(simatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);

void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);


#ifdef __cplusplus
}

#endif /* __cplusplus */

#endif
#endif

+ 23341
- 0
install/include/lapack.h
File diff suppressed because it is too large
View File


+ 12888
- 0
install/include/lapacke.h
File diff suppressed because it is too large
View File


+ 159
- 0
install/include/lapacke_config.h View File

@@ -0,0 +1,159 @@
/*****************************************************************************
Copyright (c) 2010, Intel Corp.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************
* Contents: Native C interface to LAPACK
* Author: Intel Corporation
*****************************************************************************/

#ifndef _LAPACKE_CONFIG_H_
#define _LAPACKE_CONFIG_H_

#ifdef __cplusplus
#if defined(LAPACK_COMPLEX_CPP)
#include <complex>
#endif
extern "C" {
#endif /* __cplusplus */

#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>

#ifndef lapack_int
#if defined(LAPACK_ILP64)
#define lapack_int int64_t
#else
#define lapack_int int32_t
#endif
#endif

/*
* Integer format string
*/
#ifndef LAPACK_IFMT
#if defined(LAPACK_ILP64)
#define LAPACK_IFMT PRId64
#else
#define LAPACK_IFMT PRId32
#endif
#endif

#ifndef lapack_logical
#define lapack_logical lapack_int
#endif

#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER)
#define LAPACK_COMPLEX_STRUCTURE
#define LAPACK_GLOBAL(lcname,UCNAME) lcname
#define NOCHANGE
#endif

#ifndef LAPACK_COMPLEX_CUSTOM
#if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER)
#if defined(LAPACK_COMPLEX_CPP)
#include <complex>
#define lapack_complex_float std::complex<float>
#define lapack_complex_double std::complex<double>
#define lapack_complex_float_real(z) ((z).real())
#define lapack_complex_float_imag(z) ((z).imag())
#define lapack_complex_double_real(z) ((z).real())
#define lapack_complex_double_imag(z) ((z).imag())
#define _CRT_USE_C_COMPLEX_H
#else
#include <complex.h>
#define LAPACK_COMPLEX_CUSTOM
#define lapack_complex_float _Fcomplex
#define lapack_complex_double _Dcomplex
#define lapack_complex_float_real(z) (creal(z))
#define lapack_complex_float_imag(z) (cimag(z))
#define lapack_complex_double_real(z) (creal(z))
#define lapack_complex_double_imag(z) (cimag(z))
#endif
#else

#if defined(LAPACK_COMPLEX_STRUCTURE)

typedef struct { float real, imag; } _lapack_complex_float;
typedef struct { double real, imag; } _lapack_complex_double;
#define lapack_complex_float _lapack_complex_float
#define lapack_complex_double _lapack_complex_double
#define lapack_complex_float_real(z) ((z).real)
#define lapack_complex_float_imag(z) ((z).imag)
#define lapack_complex_double_real(z) ((z).real)
#define lapack_complex_double_imag(z) ((z).imag)

#elif defined(LAPACK_COMPLEX_C99)

#include <complex.h>
#define lapack_complex_float float _Complex
#define lapack_complex_double double _Complex
#define lapack_complex_float_real(z) (creal(z))
#define lapack_complex_float_imag(z) (cimag(z))
#define lapack_complex_double_real(z) (creal(z))
#define lapack_complex_double_imag(z) (cimag(z))

#elif defined(LAPACK_COMPLEX_CPP)

#define lapack_complex_float std::complex<float>
#define lapack_complex_double std::complex<double>
#define lapack_complex_float_real(z) ((z).real())
#define lapack_complex_float_imag(z) ((z).imag())
#define lapack_complex_double_real(z) ((z).real())
#define lapack_complex_double_imag(z) ((z).imag())

#else

#include <complex.h>
#define lapack_complex_float float _Complex
#define lapack_complex_double double _Complex
#define lapack_complex_float_real(z) (creal(z))
#define lapack_complex_float_imag(z) (cimag(z))
#define lapack_complex_double_real(z) (creal(z))
#define lapack_complex_double_imag(z) (cimag(z))

#endif
#endif

lapack_complex_float lapack_make_complex_float( float re, float im );
lapack_complex_double lapack_make_complex_double( double re, double im );

#endif

#ifndef LAPACK_malloc
#define LAPACK_malloc( size ) malloc( size )
#endif

#ifndef LAPACK_free
#define LAPACK_free( p ) free( p )
#endif

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* _LAPACKE_CONFIG_H_ */

+ 17
- 0
install/include/lapacke_mangling.h View File

@@ -0,0 +1,17 @@
#ifndef LAPACK_HEADER_INCLUDED
#define LAPACK_HEADER_INCLUDED

#ifndef LAPACK_GLOBAL
#if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_)
#define LAPACK_GLOBAL(lcname,UCNAME) lcname##_
#elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER)
#define LAPACK_GLOBAL(lcname,UCNAME) UCNAME
#elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE)
#define LAPACK_GLOBAL(lcname,UCNAME) lcname
#else
#define LAPACK_GLOBAL(lcname,UCNAME) lcname##_
#endif
#endif

#endif


+ 612
- 0
install/include/lapacke_utils.h View File

@@ -0,0 +1,612 @@
/*****************************************************************************
Copyright (c) 2014, Intel Corp.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************
* Contents: Native C interface to LAPACK utility functions
* Author: Intel Corporation
*****************************************************************************/

#ifndef _LAPACKE_UTILS_H_
#define _LAPACKE_UTILS_H_

#include "lapacke.h"

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

#ifndef ABS
#define ABS(x) (((x) < 0) ? -(x) : (x))
#endif
#ifndef MAX
#define MAX(x,y) (((x) > (y)) ? (x) : (y))
#endif
#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif
#ifndef MAX3
#define MAX3(x,y,z) (((x) > MAX(y,z)) ? (x) : MAX(y,z))
#endif
#ifndef MIN3
#define MIN3(x,y,z) (((x) < MIN(y,z)) ? (x) : MIN(y,z))
#endif

#define IS_S_NONZERO(x) ( (x) < 0 || (x) > 0 )
#define IS_D_NONZERO(x) ( (x) < 0 || (x) > 0 )
#define IS_C_NONZERO(x) ( IS_S_NONZERO(*((float*)&x)) || \
IS_S_NONZERO(*(((float*)&x)+1)) )
#define IS_Z_NONZERO(x) ( IS_D_NONZERO(*((double*)&x)) || \
IS_D_NONZERO(*(((double*)&x)+1)) )

/* Error handler */
void LAPACKE_xerbla( const char *name, lapack_int info );

/* Compare two chars (case-insensitive) */
lapack_logical LAPACKE_lsame( char ca, char cb )
#if defined __GNUC__
__attribute__((const))
#endif
;

/* Functions to convert column-major to row-major 2d arrays and vice versa. */
void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,
lapack_int kl, lapack_int ku,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_cge_trans( int matrix_layout, lapack_int m, lapack_int n,
const lapack_complex_float* in, lapack_int ldin,
lapack_complex_float* out, lapack_int ldout );
void LAPACKE_cgg_trans( int matrix_layout, lapack_int m, lapack_int n,
const lapack_complex_float* in, lapack_int ldin,
lapack_complex_float* out, lapack_int ldout );
void LAPACKE_chb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_che_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_chp_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_float *in,
lapack_complex_float *out );
void LAPACKE_chs_trans( int matrix_layout, lapack_int n,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_cpb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_cpf_trans( int matrix_layout, char transr, char uplo,
lapack_int n, const lapack_complex_float *in,
lapack_complex_float *out );
void LAPACKE_cpo_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_cpp_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_float *in,
lapack_complex_float *out );
void LAPACKE_csp_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_float *in,
lapack_complex_float *out );
void LAPACKE_csy_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_ctb_trans( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_ctf_trans( int matrix_layout, char transr, char uplo, char diag,
lapack_int n, const lapack_complex_float *in,
lapack_complex_float *out );
void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag,
lapack_int n, const lapack_complex_float *in,
lapack_complex_float *out );
void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );
void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const lapack_complex_float *in, lapack_int ldin,
lapack_complex_float *out, lapack_int ldout );

void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n,
lapack_int kl, lapack_int ku,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dge_trans( int matrix_layout, lapack_int m, lapack_int n,
const double* in, lapack_int ldin,
double* out, lapack_int ldout );
void LAPACKE_dgg_trans( int matrix_layout, lapack_int m, lapack_int n,
const double* in, lapack_int ldin,
double* out, lapack_int ldout );
void LAPACKE_dhs_trans( int matrix_layout, lapack_int n,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dpb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dpf_trans( int matrix_layout, char transr, char uplo,
lapack_int n, const double *in,
double *out );
void LAPACKE_dpo_trans( int matrix_layout, char uplo, lapack_int n,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dpp_trans( int matrix_layout, char uplo, lapack_int n,
const double *in,
double *out );
void LAPACKE_dsb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dsp_trans( int matrix_layout, char uplo, lapack_int n,
const double *in,
double *out );
void LAPACKE_dsy_trans( int matrix_layout, char uplo, lapack_int n,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dtb_trans( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dtf_trans( int matrix_layout, char transr, char uplo, char diag,
lapack_int n, const double *in,
double *out );
void LAPACKE_dtp_trans( int matrix_layout, char uplo, char diag,
lapack_int n, const double *in,
double *out );
void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );
void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const double *in, lapack_int ldin,
double *out, lapack_int ldout );

void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n,
lapack_int kl, lapack_int ku,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_sge_trans( int matrix_layout, lapack_int m, lapack_int n,
const float* in, lapack_int ldin,
float* out, lapack_int ldout );
void LAPACKE_sgg_trans( int matrix_layout, lapack_int m, lapack_int n,
const float* in, lapack_int ldin,
float* out, lapack_int ldout );
void LAPACKE_shs_trans( int matrix_layout, lapack_int n,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_spb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_spf_trans( int matrix_layout, char transr, char uplo,
lapack_int n, const float *in,
float *out );
void LAPACKE_spo_trans( int matrix_layout, char uplo, lapack_int n,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_spp_trans( int matrix_layout, char uplo, lapack_int n,
const float *in,
float *out );
void LAPACKE_ssb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_ssp_trans( int matrix_layout, char uplo, lapack_int n,
const float *in,
float *out );
void LAPACKE_ssy_trans( int matrix_layout, char uplo, lapack_int n,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_stb_trans( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_stf_trans( int matrix_layout, char transr, char uplo, char diag,
lapack_int n, const float *in,
float *out );
void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag,
lapack_int n, const float *in,
float *out );
void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );
void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const float *in, lapack_int ldin,
float *out, lapack_int ldout );

void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n,
lapack_int kl, lapack_int ku,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_zge_trans( int matrix_layout, lapack_int m, lapack_int n,
const lapack_complex_double* in, lapack_int ldin,
lapack_complex_double* out, lapack_int ldout );
void LAPACKE_zgg_trans( int matrix_layout, lapack_int m, lapack_int n,
const lapack_complex_double* in, lapack_int ldin,
lapack_complex_double* out, lapack_int ldout );
void LAPACKE_zhb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_zhe_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_zhp_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_double *in,
lapack_complex_double *out );
void LAPACKE_zhs_trans( int matrix_layout, lapack_int n,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_zpb_trans( int matrix_layout, char uplo, lapack_int n,
lapack_int kd,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_zpf_trans( int matrix_layout, char transr, char uplo,
lapack_int n, const lapack_complex_double *in,
lapack_complex_double *out );
void LAPACKE_zpo_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_zpp_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_double *in,
lapack_complex_double *out );
void LAPACKE_zsp_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_double *in,
lapack_complex_double *out );
void LAPACKE_zsy_trans( int matrix_layout, char uplo, lapack_int n,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_ztb_trans( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_ztf_trans( int matrix_layout, char transr, char uplo, char diag,
lapack_int n, const lapack_complex_double *in,
lapack_complex_double *out );
void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag,
lapack_int n, const lapack_complex_double *in,
lapack_complex_double *out );
void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );
void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const lapack_complex_double *in, lapack_int ldin,
lapack_complex_double *out, lapack_int ldout );

/* NaN checkers */
#define LAPACK_SISNAN( x ) ( x != x )
#define LAPACK_DISNAN( x ) ( x != x )
#define LAPACK_CISNAN( x ) ( LAPACK_SISNAN(*((float*) &x)) || \
LAPACK_SISNAN(*(((float*) &x)+1)) )
#define LAPACK_ZISNAN( x ) ( LAPACK_DISNAN(*((double*)&x)) || \
LAPACK_DISNAN(*(((double*)&x)+1)) )

/* NaN checkers for vectors */
lapack_logical LAPACKE_c_nancheck( lapack_int n,
const lapack_complex_float *x,
lapack_int incx );
lapack_logical LAPACKE_d_nancheck( lapack_int n,
const double *x,
lapack_int incx );
lapack_logical LAPACKE_s_nancheck( lapack_int n,
const float *x,
lapack_int incx );
lapack_logical LAPACKE_z_nancheck( lapack_int n,
const lapack_complex_double *x,
lapack_int incx );
/* NaN checkers for matrices */
lapack_logical LAPACKE_cgb_nancheck( int matrix_layout, lapack_int m,
lapack_int n, lapack_int kl,
lapack_int ku,
const lapack_complex_float *ab,
lapack_int ldab );
lapack_logical LAPACKE_cge_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_cgg_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_cgt_nancheck( lapack_int n,
const lapack_complex_float *dl,
const lapack_complex_float *d,
const lapack_complex_float *du );
lapack_logical LAPACKE_chb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const lapack_complex_float* ab,
lapack_int ldab );
lapack_logical LAPACKE_che_nancheck( int matrix_layout, char uplo,
lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_chp_nancheck( lapack_int n,
const lapack_complex_float *ap );
lapack_logical LAPACKE_chs_nancheck( int matrix_layout, lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_cpb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const lapack_complex_float* ab,
lapack_int ldab );
lapack_logical LAPACKE_cpf_nancheck( lapack_int n,
const lapack_complex_float *a );
lapack_logical LAPACKE_cpo_nancheck( int matrix_layout, char uplo,
lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_cpp_nancheck( lapack_int n,
const lapack_complex_float *ap );
lapack_logical LAPACKE_cpt_nancheck( lapack_int n,
const float *d,
const lapack_complex_float *e );
lapack_logical LAPACKE_csp_nancheck( lapack_int n,
const lapack_complex_float *ap );
lapack_logical LAPACKE_cst_nancheck( lapack_int n,
const lapack_complex_float *d,
const lapack_complex_float *e );
lapack_logical LAPACKE_csy_nancheck( int matrix_layout, char uplo,
lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_ctb_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const lapack_complex_float* ab,
lapack_int ldab );
lapack_logical LAPACKE_ctf_nancheck( int matrix_layout, char transr,
char uplo, char diag,
lapack_int n,
const lapack_complex_float *a );
lapack_logical LAPACKE_ctp_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const lapack_complex_float *ap );
lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const lapack_complex_float *a,
lapack_int lda );
lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const lapack_complex_float *a,
lapack_int lda );

lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m,
lapack_int n, lapack_int kl,
lapack_int ku,
const double *ab,
lapack_int ldab );
lapack_logical LAPACKE_dge_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const double *a,
lapack_int lda );
lapack_logical LAPACKE_dgg_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const double *a,
lapack_int lda );
lapack_logical LAPACKE_dgt_nancheck( lapack_int n,
const double *dl,
const double *d,
const double *du );
lapack_logical LAPACKE_dhs_nancheck( int matrix_layout, lapack_int n,
const double *a,
lapack_int lda );
lapack_logical LAPACKE_dpb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const double* ab,
lapack_int ldab );
lapack_logical LAPACKE_dpf_nancheck( lapack_int n,
const double *a );
lapack_logical LAPACKE_dpo_nancheck( int matrix_layout, char uplo,
lapack_int n,
const double *a,
lapack_int lda );
lapack_logical LAPACKE_dpp_nancheck( lapack_int n,
const double *ap );
lapack_logical LAPACKE_dpt_nancheck( lapack_int n,
const double *d,
const double *e );
lapack_logical LAPACKE_dsb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const double* ab,
lapack_int ldab );
lapack_logical LAPACKE_dsp_nancheck( lapack_int n,
const double *ap );
lapack_logical LAPACKE_dst_nancheck( lapack_int n,
const double *d,
const double *e );
lapack_logical LAPACKE_dsy_nancheck( int matrix_layout, char uplo,
lapack_int n,
const double *a,
lapack_int lda );
lapack_logical LAPACKE_dtb_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const double* ab,
lapack_int ldab );
lapack_logical LAPACKE_dtf_nancheck( int matrix_layout, char transr,
char uplo, char diag,
lapack_int n,
const double *a );
lapack_logical LAPACKE_dtp_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const double *ap );
lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const double *a,
lapack_int lda );
lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const double *a, lapack_int lda );

lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m,
lapack_int n, lapack_int kl,
lapack_int ku,
const float *ab,
lapack_int ldab );
lapack_logical LAPACKE_sge_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const float *a,
lapack_int lda );
lapack_logical LAPACKE_sgg_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const float *a,
lapack_int lda );
lapack_logical LAPACKE_sgt_nancheck( lapack_int n,
const float *dl,
const float *d,
const float *du );
lapack_logical LAPACKE_shs_nancheck( int matrix_layout, lapack_int n,
const float *a,
lapack_int lda );
lapack_logical LAPACKE_spb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const float* ab,
lapack_int ldab );
lapack_logical LAPACKE_spf_nancheck( lapack_int n,
const float *a );
lapack_logical LAPACKE_spo_nancheck( int matrix_layout, char uplo,
lapack_int n,
const float *a,
lapack_int lda );
lapack_logical LAPACKE_spp_nancheck( lapack_int n,
const float *ap );
lapack_logical LAPACKE_spt_nancheck( lapack_int n,
const float *d,
const float *e );
lapack_logical LAPACKE_ssb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const float* ab,
lapack_int ldab );
lapack_logical LAPACKE_ssp_nancheck( lapack_int n,
const float *ap );
lapack_logical LAPACKE_sst_nancheck( lapack_int n,
const float *d,
const float *e );
lapack_logical LAPACKE_ssy_nancheck( int matrix_layout, char uplo,
lapack_int n,
const float *a,
lapack_int lda );
lapack_logical LAPACKE_stb_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const float* ab,
lapack_int ldab );
lapack_logical LAPACKE_stf_nancheck( int matrix_layout, char transr,
char uplo, char diag,
lapack_int n,
const float *a );
lapack_logical LAPACKE_stp_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const float *ap );
lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const float *a,
lapack_int lda );
lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const float *a, lapack_int lda );

lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m,
lapack_int n, lapack_int kl,
lapack_int ku,
const lapack_complex_double *ab,
lapack_int ldab );
lapack_logical LAPACKE_zge_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_zgg_nancheck( int matrix_layout, lapack_int m,
lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_zgt_nancheck( lapack_int n,
const lapack_complex_double *dl,
const lapack_complex_double *d,
const lapack_complex_double *du );
lapack_logical LAPACKE_zhb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const lapack_complex_double* ab,
lapack_int ldab );
lapack_logical LAPACKE_zhe_nancheck( int matrix_layout, char uplo,
lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_zhp_nancheck( lapack_int n,
const lapack_complex_double *ap );
lapack_logical LAPACKE_zhs_nancheck( int matrix_layout, lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_zpb_nancheck( int matrix_layout, char uplo,
lapack_int n, lapack_int kd,
const lapack_complex_double* ab,
lapack_int ldab );
lapack_logical LAPACKE_zpf_nancheck( lapack_int n,
const lapack_complex_double *a );
lapack_logical LAPACKE_zpo_nancheck( int matrix_layout, char uplo,
lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_zpp_nancheck( lapack_int n,
const lapack_complex_double *ap );
lapack_logical LAPACKE_zpt_nancheck( lapack_int n,
const double *d,
const lapack_complex_double *e );
lapack_logical LAPACKE_zsp_nancheck( lapack_int n,
const lapack_complex_double *ap );
lapack_logical LAPACKE_zst_nancheck( lapack_int n,
const lapack_complex_double *d,
const lapack_complex_double *e );
lapack_logical LAPACKE_zsy_nancheck( int matrix_layout, char uplo,
lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_ztb_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n, lapack_int kd,
const lapack_complex_double* ab,
lapack_int ldab );
lapack_logical LAPACKE_ztf_nancheck( int matrix_layout, char transr,
char uplo, char diag,
lapack_int n,
const lapack_complex_double *a );
lapack_logical LAPACKE_ztp_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const lapack_complex_double *ap );
lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char diag,
lapack_int n,
const lapack_complex_double *a,
lapack_int lda );
lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo,
char diag, lapack_int m, lapack_int n,
const lapack_complex_double *a,
lapack_int lda );

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* _LAPACKE_UTILS_H_ */

+ 136
- 0
install/include/openblas_config.h View File

@@ -0,0 +1,136 @@
#ifndef OPENBLAS_CONFIG_H
#define OPENBLAS_CONFIG_H
#define OPENBLAS_OS_LINUX 1
#define OPENBLAS_ARCH_RISCV64 1
#define OPENBLAS_C_GCC 1
#define OPENBLAS___64BIT__ 1
#define OPENBLAS_HAVE_C11 1
#define OPENBLAS_PTHREAD_CREATE_FUNC pthread_create
#define OPENBLAS_BUNDERSCORE _
#define OPENBLAS_NEEDBUNDERSCORE 1
#define OPENBLAS_RISCV64_ZVL128B
#define OPENBLAS_L1_DATA_SIZE 32768
#define OPENBLAS_L1_DATA_LINESIZE 32
#define OPENBLAS_L2_SIZE 1048576
#define OPENBLAS_L2_LINESIZE 32
#define OPENBLAS_DTB_DEFAULT_ENTRIES 128
#define OPENBLAS_DTB_SIZE 4096
#define OPENBLAS_L2_ASSOCIATIVE 4
#define OPENBLAS_CORE_RISCV64_ZVL128B
#define OPENBLAS_CHAR_CORENAME "RISCV64_ZVL128B"
#define OPENBLAS_GEMM_MULTITHREAD_THRESHOLD 4
#define OPENBLAS_VERSION " OpenBLAS 0.3.29.dev "
/*This is only for "make install" target.*/

#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
#define OPENBLAS_WINDOWS_ABI
#define OPENBLAS_OS_WINDOWS

#ifdef DOUBLE
#define DOUBLE_DEFINED DOUBLE
#undef DOUBLE
#endif
#endif

#ifdef OPENBLAS_NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
#else
#define BLASFUNC(FUNC) FUNC
#endif

#ifdef OPENBLAS_QUAD_PRECISION
typedef struct {
unsigned long x[2];
} xdouble;
#elif defined OPENBLAS_EXPRECISION
#define xdouble long double
#else
#define xdouble double
#endif

#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
typedef long long BLASLONG;
typedef unsigned long long BLASULONG;
#else
typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif

#ifndef BFLOAT16
#include <stdint.h>
typedef uint16_t bfloat16;
#endif

#if defined(__GNUC__) && (__GNUC__ >= 12)
typedef _Float16 hfloat16;
#else
#include <stdint.h>
typedef uint16_t hfloat16;
#endif

#ifdef OPENBLAS_USE64BITINT
typedef BLASLONG blasint;
#else
typedef int blasint;
#endif

#if defined(XDOUBLE) || defined(DOUBLE)
#define FLOATRET FLOAT
#else
#ifdef NEED_F2CCONV
#define FLOATRET double
#else
#define FLOATRET float
#endif
#endif

/* Inclusion of a standard header file is needed for definition of __STDC_*
predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs
as a side effect of including either <features.h> or <stdc-predef.h>. */
#include <stdio.h>

/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
#define OPENBLAS_COMPLEX_C99
#ifndef __cplusplus
#include <complex.h>
#endif
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
typedef xdouble _Complex openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
#define openblas_complex_float_real(z) (creal(z))
#define openblas_complex_float_imag(z) (cimag(z))
#define openblas_complex_double_real(z) (creal(z))
#define openblas_complex_double_imag(z) (cimag(z))
#define openblas_complex_xdouble_real(z) (creal(z))
#define openblas_complex_xdouble_imag(z) (cimag(z))
#else
#define OPENBLAS_COMPLEX_STRUCT
typedef struct { float real, imag; } openblas_complex_float;
typedef struct { double real, imag; } openblas_complex_double;
typedef struct { xdouble real, imag; } openblas_complex_xdouble;
#define openblas_make_complex_float(real, imag) {(real), (imag)}
#define openblas_make_complex_double(real, imag) {(real), (imag)}
#define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
#define openblas_complex_float_real(z) ((z).real)
#define openblas_complex_float_imag(z) ((z).imag)
#define openblas_complex_double_real(z) ((z).real)
#define openblas_complex_double_imag(z) ((z).imag)
#define openblas_complex_xdouble_real(z) ((z).real)
#define openblas_complex_xdouble_imag(z) ((z).imag)
#endif

/* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */
#ifdef OPENBLAS_OS_LINUX
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#endif
#endif /* OPENBLAS_CONFIG_H */

+ 4
- 0
install/lib/cmake/openblas/OpenBLASConfig.cmake View File

@@ -0,0 +1,4 @@
SET(OpenBLAS_VERSION "0.3.29.dev")
file(REAL_PATH "../../.." _OpenBLAS_ROOT_DIR BASE_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} )
SET(OpenBLAS_INCLUDE_DIRS ${_OpenBLAS_ROOT_DIR}/include)
SET(OpenBLAS_LIBRARIES ${_OpenBLAS_ROOT_DIR}/lib/libopenblas.so)

+ 9
- 0
install/lib/cmake/openblas/OpenBLASConfigVersion.cmake View File

@@ -0,0 +1,9 @@
set (PACKAGE_VERSION "0.3.29.dev")
if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
set (PACKAGE_VERSION_COMPATIBLE FALSE)
else ()
set (PACKAGE_VERSION_COMPATIBLE TRUE)
if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
set (PACKAGE_VERSION_EXACT TRUE)
endif ()
endif ()

+ 16
- 0
install/lib/pkgconfig/openblas.pc View File

@@ -0,0 +1,16 @@
libdir=/home/da/OpenBLAS/install/lib
libprefix=
libnamesuffix=
libsuffix=
includedir=/home/da/OpenBLAS/install/include
omp_opt=
openblas_config= USE_64BITINT= DYNAMIC_ARCH= DYNAMIC_OLDER= NO_CBLAS= NO_LAPACK= NO_LAPACKE= NO_AFFINITY=1 USE_OPENMP= RISCV64_ZVL128B MAX_THREADS=32
version=0.3.29.dev
extralib=-lm -lpthread -lgfortran -lm -lpthread -lgfortran
Name: openblas
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: ${version}
URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -l${libprefix}openblas${libsuffix}${libnamesuffix}
Libs.private: ${extralib}
Cflags: -I${includedir} ${omp_opt}

BIN
install/test_shgemm View File


+ 45
- 0
install/test_shgemm.c View File

@@ -0,0 +1,45 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cblas.h>
#include <riscv_vector.h>

void print_matrix(float *C, int M, int N) {
for (int i = 0; i < M; i++) {
for (int j = 0; j < N; j++) {
printf("%f ", C[i * N + j]);
}
printf("\n");
}
}

int main() {
const int M = 2, N = 2, K = 2;
const float alpha = 1.0f;
const float beta = 0.0f;

// A[M x K], row-major
hfloat16 A[4] = {1.0, 2.0,
3.0, 4.0};

// B[K x N], row-major
hfloat16 B[4] = {5.0, 6.0,
7.0, 8.0};

// C[M x N], row-major
float C[4] = {0};

// Call OpenBLAS float16 GEMM
cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
M, N, K,
alpha,
A, K, // lda = K
B, N, // ldb = N
beta,
C, N); // ldc = N

printf("Result C = A*B:\n");
print_matrix(C, M, N);
return 0;
}


BIN
install/zvl_test View File


+ 22
- 0
install/zvl_test.c View File

@@ -0,0 +1,22 @@
#include <riscv_vector.h>
#include <stdio.h>
#include <stdlib.h>

int main(){
unsigned int gvl = __riscv_vsetvl_e32m2(8);
float *A = (float *)malloc(4 * 4 * sizeof(float));
for (int i =0;i<4*4;i++){
A[i]=i%10;
}
vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[0], gvl);
float tmp[8];
__riscv_vse32_v_f32m2(tmp, A0, gvl);

printf("A0 vector contents:\n");
for (int i = 0; i < gvl; i++) {
printf("tmp[%d] = %.2f\n", i, tmp[i]);
}

free(A);
return 0;
}

+ 3
- 0
interface/CMakeLists.txt View File

@@ -136,6 +136,9 @@ if (BUILD_BFLOAT16)
GenerateNamedObjects("gemm_batch.c" "" "sbgemm_batch" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm_batch.c" "" "sbgemm_batch" ${CBLAS_FLAG} "" "" true "BFLOAT16")
endif () endif ()
endif () endif ()
if (BUILD_HFLOAT16)
GenerateNamedObjects("gemm.c" "" "shgemm" ${CBLAS_FLAG} "" "" true "HFLOAT16")
endif ()


# complex-specific sources # complex-specific sources
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})


+ 22
- 2
interface/Makefile View File

@@ -53,6 +53,10 @@ SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX)
SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
endif endif


ifeq ($(BUILD_HFLOAT16),1)
SHBLAS3OBJS = shgemm.$(SUFFIX)
endif

DBLAS1OBJS = \ DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \
@@ -291,6 +295,10 @@ CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(S
CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
endif endif


ifeq ($(BUILD_HFLOAT16),1)
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
endif

CDBLAS1OBJS = \ CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
@@ -388,6 +396,7 @@ SBLAS3OBJS += $(CSBLAS3OBJS)
SBBLAS1OBJS += $(CSBBLAS1OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS)
SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS2OBJS += $(CSBBLAS2OBJS)
SBBLAS3OBJS += $(CSBBLAS3OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS)
SHBLAS3OBJS += $(CSHBLAS3OBJS)
DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS1OBJS += $(CDBLAS1OBJS)
DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS2OBJS += $(CDBLAS2OBJS)
DBLAS3OBJS += $(CDBLAS3OBJS) DBLAS3OBJS += $(CDBLAS3OBJS)
@@ -405,6 +414,7 @@ endif


SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS)
SHBLASOBJS = $(SHBLAS3OBJS)
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
@@ -512,7 +522,7 @@ ifneq ($(BUILD_COMPLEX16),1)
ZBLASOBJS= ZBLASOBJS=
endif endif


FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(SHBLASOBJS)


ifeq ($(EXPRECISION), 1) ifeq ($(EXPRECISION), 1)
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
@@ -550,7 +560,7 @@ level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $
level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^


level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS)
level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(SHBLAS3OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^


aux : $(CBAUXOBJS) aux : $(CBAUXOBJS)
@@ -1309,6 +1319,11 @@ sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)
endif endif


ifeq ($(BUILD_HFLOAT16),1)
shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
endif

sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)


@@ -1968,6 +1983,11 @@ cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif endif


ifeq ($(BUILD_HFLOAT16),1)
cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif

cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)




+ 55
- 0
kernel/CMakeLists.txt View File

@@ -351,6 +351,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
endif () endif ()
if (BUILD_HFLOAT16)
if (SHGEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMINCOPY}" "" "${SHGEMMINCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
if (SHGEMMITCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMITCOPY}" "" "${SHGEMMITCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
if (SHGEMMONCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMONCOPY}" "" "${SHGEMMONCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
if (SHGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${SHGEMMOTCOPY}" "" "${SHGEMMOTCOPYOBJ}" false "" "" true "HFLOAT16")
endif ()
GenerateNamedObjects("${KERNELDIR}/${SHGEMMKERNEL}" "" "gemm_kernel" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_BETA}" "" "gemm_beta" false "" "" false "HFLOAT16")
endif ()
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_char}GEMMINCOPY) if (${float_char}GEMMINCOPY)
@@ -769,6 +785,45 @@ endif ()
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
endif () endif ()

if (BUILD_HFLOAT16)
if (NOT DEFINED SHGEMM_SMALL_M_PERMIT)
set(SHGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_NN)
set(SHGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_NT)
set(SHGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_TN)
set(SHGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_TT)
set(SHGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_NN)
set(SHGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_NT)
set(SHGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_TN)
set(SHGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SHGEMM_SMALL_K_B0_TT)
set(SHGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "HFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "HFLOAT16")
endif ()
endif () endif ()


if (NOT DEFINED ${float_char}OMATCOPY_CN) if (NOT DEFINED ${float_char}OMATCOPY_CN)


+ 164
- 0
kernel/Makefile.L3 View File

@@ -129,6 +129,26 @@ SBKERNELOBJS += \
$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
endif endif


ifeq ($(BUILD_HFLOAT16), 1)
ifndef SHGEMMKERNEL
SHGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif

SHKERNELOBJS += \
shgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
endif

ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
SKERNELOBJS += \ SKERNELOBJS += \
sgemm_kernel$(TSUFFIX).$(SUFFIX) \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \
@@ -192,6 +212,9 @@ XKERNELOBJS += \
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += $(SBKERNELOBJS) SBBLASOBJS += $(SBKERNELOBJS)
endif endif
ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += $(SHKERNELOBJS)
endif
SBLASOBJS += $(SKERNELOBJS) SBLASOBJS += $(SKERNELOBJS)
DBLASOBJS += $(DKERNELOBJS) DBLASOBJS += $(DKERNELOBJS)
QBLASOBJS += $(QKERNELOBJS) QBLASOBJS += $(QKERNELOBJS)
@@ -202,6 +225,9 @@ XBLASOBJS += $(XKERNELOBJS)
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
endif endif
ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
endif


ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
SBLASOBJS += \ SBLASOBJS += \
@@ -493,6 +519,15 @@ SBBLASOBJS += \
sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
endif endif


ifeq ($(BUILD_HFLOAT16),1)
SHBLASOBJS += \
shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
endif

SBLASOBJS += \ SBLASOBJS += \
sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
@@ -599,6 +634,13 @@ SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
endif endif


ifeq ($(BUILD_HFLOAT16), 1)
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
endif

SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
@@ -629,6 +671,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif


ifeq ($(BUILD_HFLOAT16),1)
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif

$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@


@@ -671,6 +718,25 @@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
endif endif
endif endif


ifeq ($(BUILD_HFLOAT16), 1)

$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))

$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

endif
endif

$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@


@@ -853,6 +919,12 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMM
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif


ifeq ($(BUILD_HFLOAT16), 1)

$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif

$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
ifeq ($(OS), AIX) ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
@@ -2840,6 +2912,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif


ifeq ($(BUILD_HFLOAT16),1)
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif

$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@


@@ -2873,6 +2950,23 @@ $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
endif endif
endif endif


ifeq ($(BUILD_HFLOAT16), 1)
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

endif
endif

$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@


@@ -2983,6 +3077,11 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEM
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif


ifeq ($(BUILD_HFLOAT16), 1)
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif

$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@


@@ -4843,6 +4942,71 @@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMA
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
endif endif


ifeq ($(BUILD_HFLOAT16), 1)
ifndef SHGEMM_SMALL_M_PERMIT
SHGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif

ifndef SHGEMM_SMALL_K_NN
SHGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif

ifndef SHGEMM_SMALL_K_NT
SHGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif

ifndef SHGEMM_SMALL_K_TN
SHGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif

ifndef SHGEMM_SMALL_K_TT
SHGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

$(KDIR)shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

ifndef SHGEMM_SMALL_K_B0_NN
SHGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif

ifndef SHGEMM_SMALL_K_B0_NT
SHGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif

ifndef SHGEMM_SMALL_K_B0_TN
SHGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif

ifndef SHGEMM_SMALL_K_B0_TT
SHGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

$(KDIR)shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@

$(KDIR)shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@

$(KDIR)shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@

$(KDIR)shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
endif

ifndef CGEMM_SMALL_M_PERMIT ifndef CGEMM_SMALL_M_PERMIT
CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
endif endif


+ 9
- 0
kernel/riscv64/KERNEL.RISCV64_ZVL128B View File

@@ -245,3 +245,12 @@ endif
ifndef ZGEMM_BETA ifndef ZGEMM_BETA
ZGEMM_BETA = zgemm_beta_rvv.c ZGEMM_BETA = zgemm_beta_rvv.c
endif endif

SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifndef SHGEMM_BETA
SHGEMM_BETA = gemm_beta_rvv.c
endif

+ 16
- 0
kernel/riscv64/KERNEL.RISCV64_ZVL256B View File

@@ -207,3 +207,19 @@ COMATCOPY_CN = zomatcopy_cn_vector.c


DOMATCOPY_CN = omatcopy_cn_vector.c DOMATCOPY_CN = omatcopy_cn_vector.c
SOMATCOPY_CN = omatcopy_cn_vector.c SOMATCOPY_CN = omatcopy_cn_vector.c


SHGEMMKERNEL = shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
SHGEMMINCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c
SHGEMMITCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_M).c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SHGEMMONCOPY = ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
SHGEMMOTCOPY = ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifndef SHGEMM_BETA
SHGEMM_BETA = gemm_beta_rvv.c
endif

+ 810
- 559
kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
File diff suppressed because it is too large
View File


+ 33
- 33
kernel/riscv64/shgemm_kernel_8x8_zvl128b.c View File

@@ -1,5 +1,6 @@


#include "common.h" #include "common.h"
#include <riscv_vector.h>


int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc)
{ {
@@ -14,7 +15,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


for (BLASLONG i=0; i<M/8; i+=1) { for (BLASLONG i=0; i<M/8; i+=1) {
BLASLONG ai=m_top*K; BLASLONG ai=m_top*K;
BLASLONG bi=n_top*K;
BLASLONG bi=n_top*K;
_Float16 B0 = B[bi+0]; _Float16 B0 = B[bi+0];
_Float16 B1 = B[bi+1]; _Float16 B1 = B[bi+1];
@@ -50,17 +51,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
bi += 8; bi += 8;


A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl ); A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 16;
ai += 8;


result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
result4 = __riscv_vfwmacc_vf_f32m2(result4, A0, B4, gvl);
result5 = __riscv_vfwmacc_vf_f32m2(result5, A0, B5, gvl);
result6 = __riscv_vfwmacc_vf_f32m2(result6, A0, B6, gvl);
result7 = __riscv_vfwmacc_vf_f32m2(result7, A0, B7, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
} }


@@ -86,14 +87,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


ci = n_top * ldc + m_top; ci = n_top * ldc + m_top;


__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c7, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c3, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c4, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c5, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c6, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c7, gvl); ci += ldc-gvl*0;
m_top += 8; m_top += 8;
} }


@@ -332,10 +333,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl ); A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 8; ai += 8;


result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
} }


@@ -353,10 +354,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


ci = n_top * ldc + m_top; ci = n_top * ldc + m_top;


__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c3, gvl);
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c3, gvl);
m_top += 8; m_top += 8;
} }


@@ -521,8 +522,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl ); A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 8; ai += 8;


result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
} }




@@ -536,8 +537,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


ci = n_top * ldc + m_top; ci = n_top * ldc + m_top;


__riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse16_v_f16m1( &C[ci], c1, gvl);
__riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
__riscv_vse32_v_f32m2( &C[ci], c1, gvl);
m_top += 8; m_top += 8;
} }


@@ -604,7 +605,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
bi+=2; bi+=2;
} }
BLASLONG ci=n_top*ldc+m_top; BLASLONG ci=n_top*ldc+m_top;
C[ci + 0 * ldc + 0] += alpha * result0; C[ci + 0 * ldc + 0] += alpha * result0;
C[ci + 0 * ldc + 1] += alpha * result1; C[ci + 0 * ldc + 1] += alpha * result1;
@@ -665,7 +665,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl ); A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
ai += 8; ai += 8;


result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
} }




@@ -677,7 +677,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,


ci = n_top * ldc + m_top; ci = n_top * ldc + m_top;


__riscv_vse16_v_f16m1( &C[ci], c0, gvl);
__riscv_vse32_v_f32m2( &C[ci], c0, gvl);
m_top += 8; m_top += 8;
} }




+ 37
- 0
kernel/setparam-ref.c View File

@@ -125,6 +125,23 @@ gotoblas_t TABLE_NAME = {
#endif #endif
#endif #endif


#ifdef BUILD_HFLOAT16
0, 0, 0,
SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N,
#ifdef SHGEMM_DEFAULT_UNROLL_MN
SHGEMM_DEFAULT_UNROLL_MN,
#else
MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N),
#endif
shgemm_kernelTS, shgemm_betaTS,
#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N
shgemm_incopyTS, shgemm_itcopyTS,
#else
shgemm_oncopyTS, shgemm_otcopyTS,
#endif
shgemm_oncopyTS, shgemm_otcopyTS,
#endif

#if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) #if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1)
0, 0, 0, 0, 0, 0,
SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N,
@@ -1252,6 +1269,9 @@ static void init_parameter(void) {


#ifdef BUILD_BFLOAT16 #ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
#endif #endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
@@ -1260,6 +1280,9 @@ static void init_parameter(void) {


#ifdef BUILD_BFLOAT16 #ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
#endif #endif
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
@@ -1269,6 +1292,9 @@ static void init_parameter(void) {


#ifdef BUILD_BFLOAT16 #ifdef BUILD_BFLOAT16
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
#endif #endif
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
@@ -1417,6 +1443,10 @@ static void init_parameter(void) {
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif #endif
#ifdef BUILD_HFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
#endif
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
#endif #endif
@@ -2012,6 +2042,13 @@ static void init_parameter(void) {
) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15); ) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15);
#endif #endif


#if BUILD_HFLOAT16==1
TABLE_NAME.shgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.shgemm_p * TABLE_NAME.shgemm_q * 4 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.shgemm_q * 4) - 15) & ~15);
#endif

#if BUILD_SINGLE==1 #if BUILD_SINGLE==1
TABLE_NAME.sgemm_r = (((BUFFER_SIZE - TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA


+ 1
- 0
lapack/CMakeLists.txt View File

@@ -3,6 +3,7 @@ include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR}) include_directories(${PROJECT_BINARY_DIR})


list (REMOVE_ITEM FLOAT_TYPES "BFLOAT16") list (REMOVE_ITEM FLOAT_TYPES "BFLOAT16")
list (REMOVE_ITEM FLOAT_TYPES "HFLOAT16")


set(LAPACK_SOURCES set(LAPACK_SOURCES
potrf/potrf_U_single.c potrf/potrf_U_single.c


+ 3
- 1
openblas_config_template.h View File

@@ -39,7 +39,9 @@ typedef unsigned long BLASULONG;
typedef uint16_t bfloat16; typedef uint16_t bfloat16;
#endif #endif


#ifndef HFLOAT16
#if defined(__GNUC__) && (__GNUC__ >= 12)
typedef _Float16 hfloat16;
#else
#include <stdint.h> #include <stdint.h>
typedef uint16_t hfloat16; typedef uint16_t hfloat16;
#endif #endif


+ 1
- 0
param.h View File

@@ -74,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#define SHGEMM_DEFAULT_UNROLL_N 8 #define SHGEMM_DEFAULT_UNROLL_N 8
#define SHGEMM_DEFAULT_UNROLL_M 8 #define SHGEMM_DEFAULT_UNROLL_M 8
#define SHGEMM_DEFAULT_UNROLL_MN 32
#define SHGEMM_DEFAULT_P 128 #define SHGEMM_DEFAULT_P 128
#define SHGEMM_DEFAULT_R 240 #define SHGEMM_DEFAULT_R 240
#define SHGEMM_DEFAULT_Q 12288 #define SHGEMM_DEFAULT_Q 12288


Loading…
Cancel
Save