Browse Source

Merge pull request #5362 from Mousius/fix-bf16

Fix SBGEMM BFLOAT16 build
pull/5364/head
Martin Kroeker GitHub 2 months ago
parent
commit
3d31887073
No known key found for this signature in database. GPG Key ID: B5690EEEBB952194
8 changed files with 16 additions and 13 deletions
  1. +2
    -1
      .github/workflows/arm64_graviton.yml
  2. +3
    -3
      cmake/cc.cmake
  3. +2
    -2
      cmake/system.cmake
  4. +2
    -1
      driver/level3/level3.c
  5. +2
    -2
      driver/level3/level3_thread.c
  6. +2
    -2
      getarch.c
  7. +1
    -1
      lapack/CMakeLists.txt
  8. +2
    -1
      lapack/potrf/potrf_parallel.c

+ 2
- 1
.github/workflows/arm64_graviton.yml View File

@@ -88,13 +88,14 @@ jobs:
 run: |
 case "${{ matrix.build }}" in
 "make")
-make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
+make -j$(nproc) DYNAMIC_ARCH=1 BUILD_BFLOAT16=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
 ;;
 "cmake")
 mkdir build && cd build
 cmake -DDYNAMIC_ARCH=1 \
 -DNOFORTRAN=0 \
 -DBUILD_WITHOUT_LAPACK=0 \
+-DBUILD_BFLOAT16=1 \
 -DCMAKE_VERBOSE_MAKEFILE=ON \
 -DCMAKE_BUILD_TYPE=Release \
 -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \


+ 3
- 3
cmake/cc.cmake View File

@@ -211,14 +211,14 @@ endif ()
 if (${CORE} STREQUAL NEOVERSEV1)
 if (NOT DYNAMIC_ARCH)
 if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
-set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
+set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1")
 elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
 set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1")
 else ()
 if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1")
 else ()
-set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve+bf16")
 endif()
 endif()
 endif ()


+ 2
- 2
cmake/system.cmake View File

@@ -291,10 +291,10 @@ if (DEFINED TARGET)


 if (${TARGET} STREQUAL NEOVERSEV1)
 if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
-set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
+set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1")
 else ()
 if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
-set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
+set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1")
 else ()
 message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.")
 endif()


+ 2
- 1
driver/level3/level3.c View File

@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin. */
+/* Copyright 2025 The OpenBLAS Project. */
 /* All rights reserved. */
 /* */
 /* Redistribution and use in source and binary forms, with or */
@@ -305,7 +306,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 }


 BLASLONG pad_min_l = min_l;
-#if defined(HALF)
+#if defined(BFLOAT16)
 #if defined(DYNAMIC_ARCH)
 pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1);
 #else


+ 2
- 2
driver/level3/level3_thread.c View File

@@ -1,6 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin. */
-/* Copyright 2023 The OpenBLAS Project. */
+/* Copyright 2023, 2025 The OpenBLAS Project. */
 /* All rights reserved. */
 /* */
 /* Redistribution and use in source and binary forms, with or */
@@ -324,7 +324,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 BLASLONG pad_min_l = min_l;


-#if defined(HALF)
+#if defined(BFLOAT16)
 #if defined(DYNAMIC_ARCH)
 pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1);
 #else


+ 2
- 2
getarch.c View File

@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2014, The OpenBLAS Project
+Copyright (c) 2011-2014, 2025 The OpenBLAS Project
 All rights reserved.


 Redistribution and use in source and binary forms, with or without
@@ -1476,7 +1476,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
 "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
-"-march=armv8.4-a+sve -mtune=neoverse-v1"
+"-march=armv8.4-a+sve+bf16 -mtune=neoverse-v1"
 #define LIBNAME "neoversev1"
 #define CORENAME "NEOVERSEV1"
 #endif


+ 1
- 1
lapack/CMakeLists.txt View File

@@ -52,7 +52,7 @@ GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" fa
 GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3)


 foreach (float_type ${FLOAT_TYPES})
-if (${float_type} STREQUAL "HALF")
+if (${float_type} STREQUAL "BFLOAT16")
 continue()
 endif()
 GenerateNamedObjects("getrf/getrf_single.c" "UNIT" "getrf_single" false "" "" false ${float_type})


+ 2
- 1
lapack/potrf/potrf_parallel.c View File

@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin. */
+/* Copyright 2025 The OpenBLAS Project. */
 /* All rights reserved. */
 /* */
 /* Redistribution and use in source and binary forms, with or */
@@ -405,7 +406,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
 #elif defined(DOUBLE)
 mode = BLAS_DOUBLE | BLAS_REAL;
 mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
-#elif defined(HALF)
+#elif defined(BFLOAT16)
 mode = BLAS_HALF | BLAS_REAL;
 mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1;
 #else


Loading…
Cancel
Save