Browse Source

Add infrastructure for BGEMM

Setting up all the infrastructure for BGEMM support in OpenBLAS, hopefully I found all the right places.

Derived mostly from the previous work done in https://github.com/OpenMathLib/OpenBLAS/pull/5287

Co-authored-by: Ye Tao <ye.tao@arm.com>
pull/5357/head
Chris Sidebottom 3 months ago
parent
commit
f95e7b0e32
23 changed files with 797 additions and 56 deletions
  1. +2
    -0
      .gitignore
  2. +1
    -0
      CONTRIBUTORS.md
  3. +6
    -4
      Makefile.system
  4. +5
    -2
      Makefile.tail
  5. +30
    -0
      cblas.h
  6. +2
    -2
      cmake/system.cmake
  7. +6
    -1
      common.h
  8. +85
    -0
      common_b.h
  9. +3
    -0
      common_interface.h
  10. +17
    -0
      common_level3.h
  11. +50
    -1
      common_macro.h
  12. +43
    -1
      common_param.h
  13. +55
    -0
      driver/level3/Makefile
  14. +32
    -0
      getarch_2nd.c
  15. +9
    -1
      interface/Makefile
  16. +6
    -1
      interface/gemm.c
  17. +66
    -2
      kernel/Makefile.L3
  18. +50
    -19
      kernel/generic/gemm_beta.c
  19. +42
    -18
      kernel/generic/gemmkernel_2x2.c
  20. +14
    -1
      kernel/setparam-ref.c
  21. +9
    -1
      param.h
  22. +40
    -2
      test/Makefile
  23. +224
    -0
      test/compare_sgemm_bgemm.c

+ 2
- 0
.gitignore View File

@@ -82,6 +82,7 @@ test/ZBLAT3.SUMM
test/ZBLAT3_3M.SUMM
test/SHBLAT3.SUMM
test/SBBLAT3.SUMM
test/BBLAT3.SUMM
test/cblat1
test/cblat2
test/cblat3
@@ -96,6 +97,7 @@ test/sblat3
test/sblat3_3m
test/test_shgemm
test/test_sbgemm
test/test_bgemm
test/zblat1
test/zblat2
test/zblat3


+ 1
- 0
CONTRIBUTORS.md View File

@@ -251,6 +251,7 @@ In chronological order:
* Ye Tao <ye.tao@arm.com>
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1
* [2025-02-27] Add sbgemv_n_neon kernel
* [2025-05-17] Impl prototype of BGEMM inferface

* Abhishek Kumar <https://github.com/abhishek-iitmadras>
* [2025-04-22] Optimise dot kernel for NEOVERSE V1


+ 6
- 4
Makefile.system View File

@@ -276,14 +276,14 @@ SMALL_MATRIX_OPT = 1
endif
ifeq ($(ARCH), arm64)
GEMM_GEMV_FORWARD = 1
GEMM_GEMV_FORWARD_BF16 = 1
SBGEMM_GEMV_FORWARD = 1
endif
ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1
endif
ifeq ($(ARCH), power)
GEMM_GEMV_FORWARD = 1
GEMM_GEMV_FORWARD_BF16 = 1
SBGEMM_GEMV_FORWARD = 1
endif

ifeq ($(SMALL_MATRIX_OPT), 1)
@@ -293,8 +293,8 @@ ifneq ($(ONLY_CBLAS), 1)
ifeq ($(GEMM_GEMV_FORWARD), 1)
CCOMMON_OPT += -DGEMM_GEMV_FORWARD
endif
ifeq ($(GEMM_GEMV_FORWARD_BF16), 1)
CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16
ifeq ($(SBGEMM_GEMV_FORWARD), 1)
CCOMMON_OPT += -DSBGEMM_GEMV_FORWARD
endif
endif

@@ -1905,6 +1905,8 @@ export BUILD_HFLOAT16
export NO_LSX
export NO_LASX

export BGEMM_UNROLL_M
export BGEMM_UNROLL_N
export SBGEMM_UNROLL_M
export SBGEMM_UNROLL_N
export SHGEMM_UNROLL_M


+ 5
- 2
Makefile.tail View File

@@ -1,3 +1,4 @@
BBLASOBJS_P = $(BBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
SHBLASPBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
@@ -12,8 +13,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))

HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))

BLASOBJS = $(SHBLASOBJS) $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS)
BLASOBJS_P = $(SHBLASPBJS_P) $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P)
BLASOBJS = $(SHBLASOBJS) $(BBLASOBJS) $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS)
BLASOBJS_P = $(SHBLASPBJS_P) $(BBLASOBJS_P) $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P)

ifdef EXPRECISION
BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
@@ -26,6 +27,7 @@ BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif

$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHFLOAT16 -UDOUBLE -UCOMPLEX
$(BBLASOBJS) $(BBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -USMALL_MATRIX_OPT
$(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
@@ -36,6 +38,7 @@ $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(SBEXTOBJS) $(SBEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX

$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(BBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)


+ 30
- 0
cblas.h View File

@@ -1,3 +1,31 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#ifndef CBLAS_H
#define CBLAS_H

@@ -441,6 +469,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);

void cblas_bgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST bfloat16 alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST bfloat16 beta, bfloat16 *C, OPENBLAS_CONST blasint ldc);
void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,


+ 2
- 2
cmake/system.cmake View File

@@ -425,8 +425,8 @@ endif ()
if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD")
endif ()
if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16")
if (SBGEMM_GEMV_FORWARD AND NOT ONLY_CBLAS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DSBGEMM_GEMV_FORWARD")
endif ()
if (SMALL_MATRIX_OPT)
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")


+ 6
- 1
common.h View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -317,7 +318,11 @@ typedef int blasint;
#elif defined(BFLOAT16)
#define IFLOAT bfloat16
#define XFLOAT IFLOAT
#define FLOAT float
#ifdef BGEMM
#define FLOAT bfloat16
#else
#define FLOAT float
#endif
#define SIZE 2
#define BASE_SHIFT 1
#define ZBASE_SHIFT 2


+ 85
- 0
common_b.h View File

@@ -0,0 +1,85 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#ifndef COMMON_B_H
#define COMMON_B_H

#ifndef DYNAMIC_ARCH
#define BGEMM_ONCOPY bgemm_oncopy
#define BGEMM_OTCOPY bgemm_otcopy
#define BGEMM_INCOPY bgemm_incopy
#define BGEMM_ITCOPY bgemm_itcopy

#define BGEMM_BETA bgemm_beta
#define BGEMM_KERNEL bgemm_kernel

#else

#define BGEMM_ONCOPY gotoblas->bgemm_oncopy
#define BGEMM_OTCOPY gotoblas->bgemm_otcopy
#define BGEMM_INCOPY gotoblas->bgemm_incopy
#define BGEMM_ITCOPY gotoblas->bgemm_itcopy
#define BGEMM_BETA gotoblas->bgemm_beta
#define BGEMM_KERNEL gotoblas->bgemm_kernel

#endif

#define BGEMM_NN bgemm_nn
#define BGEMM_CN bgemm_tn
#define BGEMM_TN bgemm_tn
#define BGEMM_NC bgemm_nt
#define BGEMM_NT bgemm_nt
#define BGEMM_CC bgemm_tt
#define BGEMM_CT bgemm_tt
#define BGEMM_TC bgemm_tt
#define BGEMM_TT bgemm_tt
#define BGEMM_NR bgemm_nn
#define BGEMM_TR bgemm_tn
#define BGEMM_CR bgemm_tn
#define BGEMM_RN bgemm_nn
#define BGEMM_RT bgemm_nt
#define BGEMM_RC bgemm_nt
#define BGEMM_RR bgemm_nn

#define BGEMM_THREAD_NN bgemm_thread_nn
#define BGEMM_THREAD_CN bgemm_thread_tn
#define BGEMM_THREAD_TN bgemm_thread_tn
#define BGEMM_THREAD_NC bgemm_thread_nt
#define BGEMM_THREAD_NT bgemm_thread_nt
#define BGEMM_THREAD_CC bgemm_thread_tt
#define BGEMM_THREAD_CT bgemm_thread_tt
#define BGEMM_THREAD_TC bgemm_thread_tt
#define BGEMM_THREAD_TT bgemm_thread_tt
#define BGEMM_THREAD_NR bgemm_thread_nn
#define BGEMM_THREAD_TR bgemm_thread_tn
#define BGEMM_THREAD_CR bgemm_thread_tn
#define BGEMM_THREAD_RN bgemm_thread_nn
#define BGEMM_THREAD_RT bgemm_thread_nt
#define BGEMM_THREAD_RC bgemm_thread_nt
#define BGEMM_THREAD_RR bgemm_thread_nn
#endif

+ 3
- 0
common_interface.h View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -483,6 +484,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint

void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
hfloat16 *, blasint *, hfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(bgemm)(char *, char *, blasint *, blasint *, blasint *, bfloat16 *,
bfloat16 *, blasint *, bfloat16 *, blasint *, bfloat16 *, bfloat16 *, blasint *);
void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,


+ 17
- 0
common_level3.h View File

@@ -56,6 +56,8 @@ int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);

int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
hfloat16 *, BLASLONG, hfloat16 *, BLASLONG, float *, BLASLONG);
int bgemm_beta(BLASLONG, BLASLONG, BLASLONG, bfloat16,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
@@ -83,6 +85,10 @@ int shgemm_incopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b
int shgemm_itcopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b);
int shgemm_oncopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b);
int shgemm_otcopy(BLASLONG m, BLASLONG n, hfloat16 *a, BLASLONG lda, hfloat16 *b);
int bgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int bgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
@@ -511,6 +517,7 @@ int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);

int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, hfloat16 *, hfloat16 *, float *, BLASLONG);
int bgemm_kernel(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
@@ -668,6 +675,11 @@ int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLAS
int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);
int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);

int bgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);

int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
@@ -770,6 +782,11 @@ int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16
int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);
int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG);

int bgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int bgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);

int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);


+ 50
- 1
common_macro.h View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -40,6 +41,7 @@
#define COMMON_MACRO

#include "common_sh.h"
#include "common_b.h"
#include "common_sb.h"
#include "common_s.h"
#include "common_d.h"
@@ -702,8 +704,52 @@
#define GEMM_THREAD_RR SHGEMM_THREAD_NN


#elif defined(BFLOAT16)
#elif defined(BFLOAT16) && defined(BGEMM)
#define GEMM_BETA BGEMM_BETA
#define GEMM_KERNEL_N BGEMM_KERNEL
#define GEMM_KERNEL_L BGEMM_KERNEL
#define GEMM_KERNEL_R BGEMM_KERNEL
#define GEMM_KERNEL_B BGEMM_KERNEL

#define GEMM_NN BGEMM_NN
#define GEMM_CN BGEMM_TN
#define GEMM_TN BGEMM_TN
#define GEMM_NC BGEMM_NT
#define GEMM_NT BGEMM_NT
#define GEMM_CC BGEMM_TT
#define GEMM_CT BGEMM_TT
#define GEMM_TC BGEMM_TT
#define GEMM_TT BGEMM_TT
#define GEMM_NR BGEMM_NN
#define GEMM_TR BGEMM_TN
#define GEMM_CR BGEMM_TN
#define GEMM_RN BGEMM_NN
#define GEMM_RT BGEMM_NT
#define GEMM_RC BGEMM_NT
#define GEMM_RR BGEMM_NN
#define GEMM_ONCOPY BGEMM_ONCOPY
#define GEMM_OTCOPY BGEMM_OTCOPY
#define GEMM_INCOPY BGEMM_INCOPY
#define GEMM_ITCOPY BGEMM_ITCOPY

#define GEMM_THREAD_NN BGEMM_THREAD_NN
#define GEMM_THREAD_CN BGEMM_THREAD_TN
#define GEMM_THREAD_TN BGEMM_THREAD_TN
#define GEMM_THREAD_NC BGEMM_THREAD_NT
#define GEMM_THREAD_NT BGEMM_THREAD_NT
#define GEMM_THREAD_CC BGEMM_THREAD_TT
#define GEMM_THREAD_CT BGEMM_THREAD_TT
#define GEMM_THREAD_TC BGEMM_THREAD_TT
#define GEMM_THREAD_TT BGEMM_THREAD_TT
#define GEMM_THREAD_NR BGEMM_THREAD_NN
#define GEMM_THREAD_TR BGEMM_THREAD_TN
#define GEMM_THREAD_CR BGEMM_THREAD_TN
#define GEMM_THREAD_RN BGEMM_THREAD_NN
#define GEMM_THREAD_RT BGEMM_THREAD_NT
#define GEMM_THREAD_RC BGEMM_THREAD_NT
#define GEMM_THREAD_RR BGEMM_THREAD_NN

#elif defined(BFLOAT16)
#define D_TO_BF16_K SBDTOBF16_K
#define D_BF16_TO_K DBF16TOD_K
#define S_TO_BF16_K SBSTOBF16_K
@@ -2663,6 +2709,9 @@
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA))
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG bgemm_p;
extern BLASLONG bgemm_q;
extern BLASLONG bgemm_r;
extern BLASLONG sbgemm_p;
extern BLASLONG sbgemm_q;
extern BLASLONG sbgemm_r;


+ 43
- 1
common_param.h View File

@@ -1,6 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* Copyright 2023, 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -65,6 +65,10 @@ int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *);


#if BUILD_BFLOAT16 == 1
int bgemm_p, bgemm_q, bgemm_r;
int bgemm_unroll_m, bgemm_unroll_n, bgemm_unroll_mn;
int bgemm_align_k;

int sbgemm_p, sbgemm_q, sbgemm_r;
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
int sbgemm_align_k;
@@ -105,6 +109,14 @@ int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *);
int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);

int (*bgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 *, bfloat16 *, BLASLONG);
int (*bgemm_beta )(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);

int (*bgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*bgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*bgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*bgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);

int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);

@@ -1254,6 +1266,13 @@ extern gotoblas_t *gotoblas;
#endif

#if (BUILD_BFLOAT16==1)
#define BGEMM_P gotoblas -> bgemm_p
#define BGEMM_Q gotoblas -> bgemm_q
#define BGEMM_R gotoblas -> bgemm_r
#define BGEMM_UNROLL_M gotoblas -> bgemm_unroll_m
#define BGEMM_UNROLL_N gotoblas -> bgemm_unroll_n
#define BGEMM_UNROLL_MN gotoblas -> bgemm_unroll_mn

#define SBGEMM_P gotoblas -> sbgemm_p
#define SBGEMM_Q gotoblas -> sbgemm_q
#define SBGEMM_R gotoblas -> sbgemm_r
@@ -1395,6 +1414,17 @@ extern gotoblas_t *gotoblas;
#endif

#if (BUILD_BFLOAT16 == 1)
#define BGEMM_P BGEMM_DEFAULT_P
#define BGEMM_Q BGEMM_DEFAULT_Q
#define BGEMM_R BGEMM_DEFAULT_R
#define BGEMM_UNROLL_M BGEMM_DEFAULT_UNROLL_M
#define BGEMM_UNROLL_N BGEMM_DEFAULT_UNROLL_N
#ifdef BGEMM_DEFAULT_UNROLL_MN
#define BGEMM_UNROLL_MN BGEMM_DEFAULT_UNROLL_MN
#else
#define BGEMM_UNROLL_MN MAX((BGEMM_UNROLL_M), (BGEMM_UNROLL_N))
#endif

#define SBGEMM_P SBGEMM_DEFAULT_P
#define SBGEMM_Q SBGEMM_DEFAULT_Q
#define SBGEMM_R SBGEMM_DEFAULT_R
@@ -1555,6 +1585,18 @@ extern gotoblas_t *gotoblas;
#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R
#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M
#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N
#elif defined(BFLOAT16) && defined(BGEMM)
#define GEMM_P BGEMM_P
#define GEMM_Q BGEMM_Q
#define GEMM_R BGEMM_R
#define GEMM_UNROLL_M BGEMM_UNROLL_M
#define GEMM_UNROLL_N BGEMM_UNROLL_N
#define GEMM_UNROLL_MN BGEMM_UNROLL_MN
#define GEMM_DEFAULT_P BGEMM_DEFAULT_P
#define GEMM_DEFAULT_Q BGEMM_DEFAULT_Q
#define GEMM_DEFAULT_R BGEMM_DEFAULT_R
#define GEMM_DEFAULT_UNROLL_M BGEMM_DEFAULT_UNROLL_M
#define GEMM_DEFAULT_UNROLL_N BGEMM_DEFAULT_UNROLL_N
#elif defined(BFLOAT16)
#define GEMM_P SBGEMM_P
#define GEMM_Q SBGEMM_Q


+ 55
- 0
driver/level3/Makefile View File

@@ -1,3 +1,32 @@
###############################################################################
# Copyright (c) 2025, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###############################################################################

TOPDIR = ../..
include ../../Makefile.system

@@ -20,6 +49,7 @@ USE_GEMM3M = 1
endif

ifeq ($(BUILD_BFLOAT16),1)
BBLASOBJS += bgemm_nn.$(SUFFIX) bgemm_nt.$(SUFFIX) bgemm_tn.$(SUFFIX) bgemm_tt.$(SUFFIX)
SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX)
endif

@@ -212,6 +242,7 @@ COMMONOBJS += syrk_thread.$(SUFFIX)

ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1)
ifeq ($(BUILD_BFLOAT16),1)
BBLASOBJS += bgemm_thread_nn.$(SUFFIX) bgemm_thread_nt.$(SUFFIX) bgemm_thread_tn.$(SUFFIX) bgemm_thread_tt.$(SUFFIX)
SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX)
endif
ifeq ($(BUILD_HFLOAT16),1)
@@ -350,6 +381,18 @@ endif

all ::

bgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

bgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)

bgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)

bgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)

sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

@@ -569,6 +612,18 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h
beta_thread.$(SUFFIX) : beta_thread.c ../../common.h
$(CC) -c $(CFLAGS) $< -o $(@F)

bgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

bgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)

bgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)

bgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)

sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)



+ 32
- 0
getarch_2nd.c View File

@@ -1,3 +1,31 @@
/***************************************************************************
* Copyright (c) 2025, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/

#include <stdio.h>
#ifndef BUILD_KERNEL
#include "config.h"
@@ -17,6 +45,10 @@ typedef unsigned long BLASULONG;
int main(int argc, char **argv) {

if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
printf("BGEMM_UNROLL_M=%d\n", BGEMM_DEFAULT_UNROLL_M);
printf("BGEMM_UNROLL_N=%d\n", BGEMM_DEFAULT_UNROLL_N);
printf("BGEMM_UNROLL_M=%d\n", BGEMM_DEFAULT_UNROLL_M);
printf("BGEMM_UNROLL_N=%d\n", BGEMM_DEFAULT_UNROLL_N);
printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M);
printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N);
printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M);


+ 9
- 1
interface/Makefile View File

@@ -47,6 +47,7 @@ SBLAS3OBJS = \
sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) sgemmtr.$(SUFFIX)

ifeq ($(BUILD_BFLOAT16),1)
BBLAS3OBJ = bgemm.$(SUFFIX)
SBBLAS1OBJS = sbdot.$(SUFFIX)
SBBLAS2OBJS = sbgemv.$(SUFFIX)
SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX)
@@ -289,6 +290,7 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemmtr.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX)

ifeq ($(BUILD_BFLOAT16),1)
CBBLAS3OBJS = cblas_bgemm.$(SUFFIX)
CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX)
@@ -393,6 +395,7 @@ override CFLAGS += -I.
SBLAS1OBJS += $(CSBLAS1OBJS)
SBLAS2OBJS += $(CSBLAS2OBJS)
SBLAS3OBJS += $(CSBLAS3OBJS)
BBLAS3OBJ += $(CBBLAS3OBJS)
SBBLAS1OBJS += $(CSBBLAS1OBJS)
SBBLAS2OBJS += $(CSBBLAS2OBJS)
SBBLAS3OBJS += $(CSBBLAS3OBJS)
@@ -412,6 +415,7 @@ SBEXTOBJS += $(CSBEXTOBJS)
CBAUXOBJS += $(CXERBLAOBJ)
endif

BBLASOBJS = $(BBLAS3OBJ)
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS)
SHBLASOBJS = $(SHBLAS3OBJS)
@@ -560,7 +564,7 @@ level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $
level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(SHBLAS3OBJS)
level3 : $(SBBLAS3OBJS) $(BBLAS3OBJ) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(SHBLAS3OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

aux : $(CBAUXOBJS)
@@ -1311,6 +1315,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
$(CC) -c $(CFLAGS) $< -o $(@F)

ifeq ($(BUILD_BFLOAT16),1)
bgemm.$(SUFFIX) bgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
@@ -1979,6 +1985,8 @@ cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

ifeq ($(BUILD_BFLOAT16),1)
cblas_bgemm.$(SUFFIX) cblas_bgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -DBGEMM -c $(CFLAGS) $< -o $(@F)
cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif


+ 6
- 1
interface/gemm.c View File

@@ -54,8 +54,13 @@
#define ERROR_NAME "DGEMM "
#define GEMV BLASFUNC(dgemv)
#elif defined(BFLOAT16)
#ifdef BGEMM
#define ERROR_NAME "BGEMM "
#define GEMV BLASFUNC(bgemv)
#else
#define ERROR_NAME "SBGEMM "
#define GEMV BLASFUNC(sbgemv)
#endif
#elif defined(HFLOAT16)
#define ERROR_NAME "SHGEMM "
#else
@@ -579,7 +584,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
#endif

#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(HFLOAT16) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16))
#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(HFLOAT16) && (!defined(BFLOAT16) || (!defined(BGEMM) && defined(SBGEMM_GEMV_FORWARD)) || (defined(BGEMM) && defined(BGEMM_GEMV_FORWARD)))
#if defined(ARCH_ARM64)
// The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c}
// perform poorly in certain circumstances. We use the following boolean


+ 66
- 2
kernel/Makefile.L3 View File

@@ -1,3 +1,30 @@
###############################################################################
# Copyright (c) 2025, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###############################################################################
USE_GEMM3M = 0
OS := $(shell uname)

@@ -110,6 +137,23 @@ endif
endif

ifeq ($(BUILD_BFLOAT16), 1)
ifndef BGEMMKERNEL
BGEMM_BETA = ../generic/gemm_beta.c
BGEMMKERNEL = ../generic/gemmkernel_2x2.c
BGEMMINCOPY = ../generic/gemm_ncopy_2.c
BGEMMITCOPY = ../generic/gemm_tcopy_2.c
BGEMMONCOPY = ../generic/gemm_ncopy_2.c
BGEMMOTCOPY = ../generic/gemm_tcopy_2.c
BGEMMINCOPYOBJ = bgemm_incopy$(TSUFFIX).$(SUFFIX)
BGEMMITCOPYOBJ = bgemm_itcopy$(TSUFFIX).$(SUFFIX)
BGEMMONCOPYOBJ = bgemm_oncopy$(TSUFFIX).$(SUFFIX)
BGEMMOTCOPYOBJ = bgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
BKERNELOBJS += \
bgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(BGEMMINCOPYOBJ) $(BGEMMITCOPYOBJ) \
$(BGEMMONCOPYOBJ) $(BGEMMOTCOPYOBJ)

ifndef SBGEMMKERNEL
SBGEMM_BETA = ../generic/gemm_beta.c
SBGEMMKERNEL = ../generic/gemmkernel_2x2.c
@@ -210,6 +254,7 @@ XKERNELOBJS += \
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)

ifeq ($(BUILD_BFLOAT16),1)
BBLASOBJS += $(BKERNELOBJS)
SBBLASOBJS += $(SBKERNELOBJS)
endif
ifeq ($(BUILD_HFLOAT16),1)
@@ -223,6 +268,7 @@ ZBLASOBJS += $(ZKERNELOBJS)
XBLASOBJS += $(XKERNELOBJS)

ifeq ($(BUILD_BFLOAT16),1)
BBLASOBJS += bgemm_beta$(TSUFFIX).$(SUFFIX)
SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(BUILD_HFLOAT16),1)
@@ -667,6 +713,8 @@ XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))

ifeq ($(BUILD_BFLOAT16),1)
$(KDIR)bgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BGEMM_BETA)
$(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
@@ -698,9 +746,22 @@ ifeq ($(ARCH), E2K)
USE_TRMM = 1
endif


ifeq ($(BUILD_BFLOAT16), 1)

$(KDIR)$(BGEMMONCOPYOBJ) : $(KERNELDIR)/$(BGEMMONCOPY)
$(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(BGEMMOTCOPYOBJ) : $(KERNELDIR)/$(BGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@

ifneq ($(BGEMM_UNROLL_M), $(BGEMM_UNROLL_N))
$(KDIR)$(BGEMMINCOPYOBJ) : $(KERNELDIR)/$(BGEMMINCOPY)
$(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(BGEMMITCOPYOBJ) : $(KERNELDIR)/$(BGEMMITCOPY)
$(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@
endif

$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

@@ -914,7 +975,8 @@ endif
endif

ifeq ($(BUILD_BFLOAT16), 1)

$(KDIR)bgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BGEMMKERNEL)
$(CC) $(CFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif
@@ -2908,6 +2970,8 @@ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

ifeq ($(BUILD_BFLOAT16),1)
$(KDIR)bgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(BGEMM_BETA)
$(CC) $(PFLAGS) -c -DBFLOAT16 -DBGEMM -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif


+ 50
- 19
kernel/generic/gemm_beta.c View File

@@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -38,11 +39,41 @@

#include "common.h"

#if defined(BFLOAT16) && defined(BGEMM) && defined(BFLOAT16CONVERSION)
static float
bfloat16tof32 (bfloat16 f16)
{
float result = 0;
unsigned short* q = (unsigned short*)(&result);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
q[0] = f16;
#else
q[1] = f16;
#endif
return result;
}
static bfloat16
f32tobfloat16(float f32)
{
unsigned short* q = (unsigned short*)(&f32);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return q[0];
#else
return q[1];
#endif
}

#define BF16TOF32(x) (bfloat16tof32(x))
#define F32TOBF16(x) (f32tobfloat16(x))
#else
#define BF16TOF32(x) x
#define F32TOBF16(x) x
#endif

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc){


BLASLONG i, j;
BLASLONG chunk, remain;
FLOAT *c_offset1, *c_offset;
@@ -54,18 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset1 = c_offset;
c_offset += ldc;
for(i=chunk; i>0; i--){
*(c_offset1 + 0) = ZERO;
*(c_offset1 + 1) = ZERO;
*(c_offset1 + 2) = ZERO;
*(c_offset1 + 3) = ZERO;
*(c_offset1 + 4) = ZERO;
*(c_offset1 + 5) = ZERO;
*(c_offset1 + 6) = ZERO;
*(c_offset1 + 7) = ZERO;
*(c_offset1 + 0) = F32TOBF16(ZERO);
*(c_offset1 + 1) = F32TOBF16(ZERO);
*(c_offset1 + 2) = F32TOBF16(ZERO);
*(c_offset1 + 3) = F32TOBF16(ZERO);
*(c_offset1 + 4) = F32TOBF16(ZERO);
*(c_offset1 + 5) = F32TOBF16(ZERO);
*(c_offset1 + 6) = F32TOBF16(ZERO);
*(c_offset1 + 7) = F32TOBF16(ZERO);
c_offset1 += 8;
}
for(i=remain; i>0; i--){
*c_offset1 = ZERO;
*c_offset1 = F32TOBF16(ZERO);
c_offset1 ++;
}
}
@@ -74,18 +105,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset1 = c_offset;
c_offset += ldc;
for(i=chunk; i>0; i--){
*(c_offset1 + 0) *= beta;
*(c_offset1 + 1) *= beta;
*(c_offset1 + 2) *= beta;
*(c_offset1 + 3) *= beta;
*(c_offset1 + 4) *= beta;
*(c_offset1 + 5) *= beta;
*(c_offset1 + 6) *= beta;
*(c_offset1 + 7) *= beta;
*(c_offset1 + 0) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[0]));
*(c_offset1 + 1) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[1]));
*(c_offset1 + 2) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[2]));
*(c_offset1 + 3) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[3]));
*(c_offset1 + 4) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[4]));
*(c_offset1 + 5) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[5]));
*(c_offset1 + 6) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[6]));
*(c_offset1 + 7) = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[7]));
c_offset1 += 8;
}
for(i=remain; i>0; i--){
*c_offset1 *= beta;
*c_offset1 = F32TOBF16(BF16TOF32(beta) * BF16TOF32(c_offset1[0]));
c_offset1 ++;
}
}


+ 42
- 18
kernel/generic/gemmkernel_2x2.c View File

@@ -12,9 +12,29 @@ bfloat16tof32 (bfloat16 f16)
#endif
return result;
}

static bfloat16 f32tobfloat16(float f32) {
unsigned short *q = (unsigned short *)(&f32);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return q[0];
#else
return q[1];
#endif
}

#ifdef BGEMM
#define ALPHA bfloat16tof32(alpha)
#define BF16TOF32(x) (bfloat16tof32(x))
#define F32TOBF16(x) (f32tobfloat16(x))
#else
#define ALPHA alpha
#define BF16TOF32(x) (bfloat16tof32(x))
#define F32TOBF16(x) x
#endif
#else
#define ALPHA alpha
#define BF16TOF32(x) x
#define F32TOBF16(x) x
#endif
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc
#ifdef TRMMKERNEL
@@ -25,7 +45,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,
BLASLONG i,j,k;
FLOAT *C0,*C1;
IFLOAT *ptrba,*ptrbb;
#ifdef BGEMM
float res0,res1,res2,res3;
#else
FLOAT res0,res1,res2,res3;
#endif
IFLOAT load0,load1,load2,load3,load4,load5,load6,load7;
for (j=0; j<bn/2; j+=1)
{
@@ -89,14 +113,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0 = res0*alpha;
C0[0] = C0[0]+res0;
res1 = res1*alpha;
C0[1] = C0[1]+res1;
res2 = res2*alpha;
C1[0] = C1[0]+res2;
res3 = res3*alpha;
C1[1] = C1[1]+res3;
res0 = res0*ALPHA;
C0[0] = F32TOBF16(BF16TOF32(C0[0])+res0);
res1 = res1*ALPHA;
C0[1] = F32TOBF16(BF16TOF32(C0[1])+res1);
res2 = res2*ALPHA;
C1[0] = F32TOBF16(BF16TOF32(C1[0])+res2);
res3 = res3*ALPHA;
C1[1] = F32TOBF16(BF16TOF32(C1[1])+res3);
C0 = C0+2;
C1 = C1+2;
}
@@ -115,10 +139,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0 = res0*alpha;
C0[0] = C0[0]+res0;
res1 = res1*alpha;
C1[0] = C1[0]+res1;
res0 = res0*ALPHA;
C0[0] = F32TOBF16(BF16TOF32(C0[0])+res0);
res1 = res1*ALPHA;
C1[0] = F32TOBF16(BF16TOF32(C1[0])+res1);
C0 = C0+1;
C1 = C1+1;
}
@@ -146,10 +170,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = C0[0]+res0;
res1 = res1*alpha;
C0[1] = C0[1]+res1;
res0 = res0*ALPHA;
C0[0] = F32TOBF16(BF16TOF32(C0[0])+res0);
res1 = res1*ALPHA;
C0[1] = F32TOBF16(BF16TOF32(C0[1])+res1);
C0 = C0+2;
}
for (i=0; i<(bm&1); i+=1)
@@ -164,8 +188,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0 = res0*alpha;
C0[0] = C0[0]+res0;
res0 = res0*ALPHA;
C0[0] = F32TOBF16(BF16TOF32(C0[0])+res0);
C0 = C0+1;
}
k = (bk<<0);


+ 14
- 1
kernel/setparam-ref.c View File

@@ -1,6 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* Copyright 2023, 2025 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@@ -57,6 +57,15 @@ gotoblas_t TABLE_NAME = {
GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,

#ifdef BUILD_BFLOAT16
0, 0, 0,
BGEMM_DEFAULT_UNROLL_M, BGEMM_DEFAULT_UNROLL_N,
#ifdef BGEMM_DEFAULT_UNROLL_MN
BGEMM_DEFAULT_UNROLL_MN,
#else
MAX(BGEMM_DEFAULT_UNROLL_M, BGEMM_DEFAULT_UNROLL_N),
#endif
BGEMM_ALIGN_K,

0, 0, 0,
SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N,
#ifdef SBGEMM_DEFAULT_UNROLL_MN
@@ -78,6 +87,10 @@ gotoblas_t TABLE_NAME = {
sbgemv_nTS, sbgemv_tTS, sger_kTS,
ssymv_LTS, ssymv_UTS,

bgemm_kernelTS, bgemm_betaTS,
bgemm_incopyTS, bgemm_itcopyTS,
bgemm_oncopyTS, bgemm_otcopyTS,

sbgemm_kernelTS, sbgemm_betaTS,
#if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N
sbgemm_incopyTS, sbgemm_itcopyTS,


+ 9
- 1
param.h View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2023, The OpenBLAS Project
Copyright (c) 2011-2023, 2025 The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -79,6 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SHGEMM_DEFAULT_R 240
#define SHGEMM_DEFAULT_Q 12288

#define BGEMM_DEFAULT_UNROLL_N 4
#define BGEMM_DEFAULT_UNROLL_M 8
#define BGEMM_DEFAULT_UNROLL_MN 32
#define BGEMM_DEFAULT_P 256
#define BGEMM_DEFAULT_R 256
#define BGEMM_DEFAULT_Q 256
#define BGEMM_ALIGN_K 1 // must be 2^x

#define SBGEMM_DEFAULT_UNROLL_N 4
#define SBGEMM_DEFAULT_UNROLL_M 8
#define SBGEMM_DEFAULT_UNROLL_MN 32


+ 40
- 2
test/Makefile View File

@@ -1,3 +1,31 @@
###############################################################################
# Copyright (c) 2025, The OpenBLAS Project
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. Neither the name of the OpenBLAS project nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
###############################################################################
TOPDIR = ..
include ../Makefile.system
ifeq ($(F_COMPILER),GFORTRAN)
@@ -165,6 +193,7 @@ endif
endif

ifeq ($(BUILD_BFLOAT16),1)
BF3= test_bgemm
B3= test_sbgemm
endif
ifeq ($(BUILD_SINGLE),1)
@@ -192,7 +221,7 @@ endif
ifeq ($(SUPPORT_GEMM3M),1)
level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m
else
level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
level3: $(BF3) $(B3) $(S3) $(D3) $(C3) $(Z3)
endif

ifneq ($(CROSS), 1)
@@ -200,6 +229,8 @@ ifneq ($(CROSS), 1)
ifeq ($(BUILD_BFLOAT16),1)
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM
@$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_bgemm > BBLAT3.SUMM
@$(GREP) -q FATAL BBLAT3.SUMM && cat BBLAT3.SUMM || exit 0
endif
ifeq ($(BUILD_SINGLE),1)
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat
@@ -223,6 +254,8 @@ ifeq ($(USE_OPENMP), 1)
ifeq ($(BUILD_BFLOAT16),1)
OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM
@$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0
OMP_NUM_THREADS=2 ./test_bgemm > BBLAT3.SUMM
@$(GREP) -q FATAL BBLAT3.SUMM && cat BBLAT3.SUMM || exit 0
endif
ifeq ($(BUILD_SINGLE),1)
OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat
@@ -244,6 +277,8 @@ else
ifeq ($(BUILD_BFLOAT16),1)
OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM
@$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0
OPENBLAS_NUM_THREADS=2 ./test_bgemm > BBLAT3.SUMM
@$(GREP) -q FATAL BBLAT3.SUMM && cat BBLAT3.SUMM || exit 0
endif
ifeq ($(BUILD_SINGLE),1)
OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat
@@ -367,6 +402,9 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME)
endif

ifeq ($(BUILD_BFLOAT16),1)
test_bgemm : compare_sgemm_bgemm.c ../$(LIBNAME)
$(CC) $(CLDFLAGS) -o test_bgemm compare_sgemm_bgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)

test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
$(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
endif
@@ -387,7 +425,7 @@ clean:
@rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \
sblat1 dblat1 cblat1 zblat1 \
sblat2 dblat2 cblat2 zblat2 \
test_sbgemm sblat3 dblat3 cblat3 zblat3 \
test_bgemm test_sbgemm sblat3 dblat3 cblat3 zblat3 \
sblat1p dblat1p cblat1p zblat1p \
sblat2p dblat2p cblat2p zblat2p \
sblat3p dblat3p cblat3p zblat3p \


+ 224
- 0
test/compare_sgemm_bgemm.c View File

@@ -0,0 +1,224 @@
/***************************************************************************
Copyright (c) 2025 The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "../common.h"
#include <stdint.h>
#include <stdio.h>

#include <arm_neon.h>

#define SGEMM BLASFUNC(sgemm)
#define BGEMM BLASFUNC(bgemm)
#define BGEMM_LARGEST 256

typedef union
{
unsigned short v;
#if defined(_AIX)
struct __attribute__((packed))
#else
struct
#endif
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
unsigned short s:1;
unsigned short e:8;
unsigned short m:7;
#else
unsigned short m:7;
unsigned short e:8;
unsigned short s:1;
#endif
} bits;
} bfloat16_bits;

typedef union
{
float v;
#if defined(_AIX)
struct __attribute__((packed))
#else
struct
#endif
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
uint32_t s:1;
uint32_t e:8;
uint32_t m:23;
#else
uint32_t m:23;
uint32_t e:8;
uint32_t s:1;
#endif
} bits;
} float32_bits;

float
float16to32 (bfloat16_bits f16)
{
float32_bits f32;
f32.bits.s = f16.bits.s;
f32.bits.e = f16.bits.e;
f32.bits.m = (uint32_t) f16.bits.m << 16;
return f32.v;
}

bfloat16
float32to16 (float32_bits f32)
{
bfloat16_bits f16;
f16.bits.s = f32.bits.s;
f16.bits.e = f32.bits.e;
f16.bits.m = (f32.bits.m >> 16) & 0x7f;
return f16.v;
}

static float truncate_float(float value) {
bfloat16_bits f16 = (bfloat16_bits)float32to16((float32_bits)value);
return float16to32(f16);
}

void *malloc_safe(size_t size) {
if (size == 0)
return malloc(1);
else
return malloc(size);
}

int
main (int argc, char *argv[])
{
blasint m, n, k;
int i, j, l;
blasint x, y;
int ret = 0;
int loop = BGEMM_LARGEST;
char transA = 'N', transB = 'N';
float alpha = 1.0, beta = 0.0;
bfloat16 alpha_bf16 = float32to16((float32_bits)alpha);
bfloat16 beta_bf16 = float32to16((float32_bits)beta);

for (x = 0; x <= loop; x++)
{
if ((x > 100) && (x != BGEMM_LARGEST)) continue;
m = k = n = x;
float *A = (float *)malloc_safe(m * k * sizeof(FLOAT));
float *B = (float *)malloc_safe(k * n * sizeof(FLOAT));
float *C = (float *)malloc_safe(m * n * sizeof(FLOAT));
bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(m * k * sizeof(bfloat16_bits));
bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(k * n * sizeof(bfloat16_bits));
bfloat16_bits *CC = (bfloat16_bits *)malloc_safe(k * n * sizeof(bfloat16_bits));
FLOAT *DD = (FLOAT *)malloc_safe(m * n * sizeof(FLOAT));
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
(DD == NULL) || (CC == NULL))
return 1;
bfloat16 atmp,btmp;
blasint one=1;

for (j = 0; j < m; j++)
{
for (i = 0; i < k; i++)
{
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one);
AA[j * k + i].v = atmp;
}
}
for (j = 0; j < n; j++)
{
for (i = 0; i < k; i++)
{
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one);
BB[j * k + i].v = btmp;
}
}
for (y = 0; y < 4; y++)
{
if ((y == 0) || (y == 2)) {
transA = 'N';
} else {
transA = 'T';
}
if ((y == 0) || (y == 1)) {
transB = 'N';
} else {
transB = 'T';
}

memset(CC, 0, m * n * sizeof(bfloat16_bits));
memset(DD, 0, m * n * sizeof(FLOAT));
memset(C, 0, m * n * sizeof(FLOAT));

SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
&m, B, &k, &beta, C, &m);
BGEMM (&transA, &transB, &m, &n, &k, &alpha_bf16, (bfloat16*) AA,
&m, (bfloat16*)BB, &k, &beta_bf16, (bfloat16*)CC, &m);

for (i = 0; i < n; i++)
for (j = 0; j < m; j++)
{
for (l = 0; l < k; l++)
if (transA == 'N' && transB == 'N')
{
DD[i * m + j] +=
float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]);
} else if (transA == 'T' && transB == 'N')
{
DD[i * m + j] +=
float16to32 (AA[k * j + l]) * float16to32 (BB[l + k * i]);
} else if (transA == 'N' && transB == 'T')
{
DD[i * m + j] +=
float16to32 (AA[l * m + j]) * float16to32 (BB[i + l * n]);
} else if (transA == 'T' && transB == 'T')
{
DD[i * m + j] +=
float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]);
}
if (fabs(float16to32(CC[i * m + j]) - truncate_float(C[i * m + j])) > 2.0) {
ret++;
}
if (fabs(float16to32(CC[i * m + j]) - truncate_float(DD[i * m + j])) > 1.0) {
ret++;
}
}
}
free(A);
free(B);
free(C);
free(AA);
free(BB);
free(CC);
free(DD);
}

if (ret != 0) {
fprintf (stderr, "FATAL ERROR BGEMM - Return code: %d\n", ret);
return ret;
}
}

Loading…
Cancel
Save