ARM64: Enable bfloat16 kernels by default (pull/5406/head)
@@ -87,10 +87,16 @@ jobs: | |||||
echo "max_size = 300M" > ~/.ccache/ccache.conf | echo "max_size = 300M" > ~/.ccache/ccache.conf | ||||
echo "compression = true" >> ~/.ccache/ccache.conf | echo "compression = true" >> ~/.ccache/ccache.conf | ||||
ccache -s | ccache -s | ||||
- name: Add gfortran runtime to link path | |||||
if: matrix.build == 'make' && runner.os == 'macOS' | |||||
run: | | |||||
GFORTRAN_LIBDIR=$(gfortran -print-file-name=libgfortran.dylib | xargs dirname) | |||||
echo "Using gfortran runtime in $GFORTRAN_LIBDIR" | |||||
echo "LDFLAGS=-L/opt/homebrew/opt/llvm/lib -L$GFORTRAN_LIBDIR" >> $GITHUB_ENV | |||||
- name: Build OpenBLAS | - name: Build OpenBLAS | ||||
run: | | run: | | ||||
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||||
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | ||||
export CC="/opt/homebrew/opt/llvm/bin/clang" | export CC="/opt/homebrew/opt/llvm/bin/clang" | ||||
case "${{ matrix.build }}" in | case "${{ matrix.build }}" in | ||||
@@ -89,6 +89,14 @@ jobs: | |||||
echo "max_size = 300M" > ~/.ccache/ccache.conf | echo "max_size = 300M" > ~/.ccache/ccache.conf | ||||
echo "compression = true" >> ~/.ccache/ccache.conf | echo "compression = true" >> ~/.ccache/ccache.conf | ||||
ccache -s | ccache -s | ||||
- name: Add gfortran runtime to link path | |||||
if: matrix.build == 'make' && runner.os == 'macOS' | |||||
run: | | |||||
GFORTRAN_LIBDIR=$(gfortran -print-file-name=libgfortran.dylib | xargs dirname) | |||||
echo "Using gfortran runtime in $GFORTRAN_LIBDIR" | |||||
# Preserve whatever LDFLAGS may already contain | |||||
echo "LDFLAGS=${LDFLAGS:+$LDFLAGS }-L$GFORTRAN_LIBDIR" >> "$GITHUB_ENV" | |||||
- name: Build OpenBLAS | - name: Build OpenBLAS | ||||
run: | | run: | | ||||
@@ -255,6 +255,7 @@ In chronological order: | |||||
* Abhishek Kumar <https://github.com/abhishek-iitmadras> | * Abhishek Kumar <https://github.com/abhishek-iitmadras> | ||||
* [2025-04-22] Optimise dot kernel for NEOVERSE V1 | * [2025-04-22] Optimise dot kernel for NEOVERSE V1 | ||||
* [2025-07-23] ARM64-Enable bfloat16 kernels by default | |||||
* Sharif Inamdar <sharif.inamdar@arm.com> | * Sharif Inamdar <sharif.inamdar@arm.com> | ||||
* [2025-06-05] Optimize gemv_n_sve_v1x3 kernel | * [2025-06-05] Optimize gemv_n_sve_v1x3 kernel | ||||
@@ -270,6 +270,7 @@ SMALL_MATRIX_OPT = 1 | |||||
BUILD_BFLOAT16 = 1 | BUILD_BFLOAT16 = 1 | ||||
else ifeq ($(ARCH), arm64) | else ifeq ($(ARCH), arm64) | ||||
SMALL_MATRIX_OPT = 1 | SMALL_MATRIX_OPT = 1 | ||||
BUILD_BFLOAT16 = 1 | |||||
endif | endif | ||||
ifeq ($(ARCH), loongarch64) | ifeq ($(ARCH), loongarch64) | ||||
SMALL_MATRIX_OPT = 1 | SMALL_MATRIX_OPT = 1 | ||||
@@ -425,10 +426,8 @@ ifeq ($(OSNAME), Darwin) | |||||
ifndef MACOSX_DEPLOYMENT_TARGET | ifndef MACOSX_DEPLOYMENT_TARGET | ||||
ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
export MACOSX_DEPLOYMENT_TARGET=11.0 | export MACOSX_DEPLOYMENT_TARGET=11.0 | ||||
ifeq ($(C_COMPILER), GCC) | |||||
export NO_SVE = 1 | export NO_SVE = 1 | ||||
export NO_SME = 1 | export NO_SME = 1 | ||||
endif | |||||
else | else | ||||
export MACOSX_DEPLOYMENT_TARGET=10.8 | export MACOSX_DEPLOYMENT_TARGET=10.8 | ||||
endif | endif | ||||
@@ -27,6 +27,7 @@ | |||||
* *****************************************************************************/ | * *****************************************************************************/ | ||||
#include <arm_sve.h> | #include <arm_sve.h> | ||||
#include <arm_neon.h> | |||||
#include "common.h" | #include "common.h" | ||||
@@ -27,6 +27,7 @@ | |||||
* *****************************************************************************/ | * *****************************************************************************/ | ||||
#include <arm_sve.h> | #include <arm_sve.h> | ||||
#include <arm_neon.h> | |||||
#include "common.h" | #include "common.h" | ||||
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#include "common.h" | #include "common.h" | ||||
#include <arm_sve.h> | #include <arm_sve.h> | ||||
#include <arm_neon.h> | |||||
#define UPDATE_PTRSx2 \ | #define UPDATE_PTRSx2 \ | ||||
a_ptr1 = a_ptr0 + lda; | a_ptr1 = a_ptr0 + lda; | ||||