@@ -58,8 +58,8 @@ task: | |||
- export VALID_ARCHS="i386 x86_64" | |||
- xcrun --sdk macosx --show-sdk-path | |||
- xcodebuild -version | |||
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" | |||
- export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" | |||
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
always: | |||
config_artifacts: | |||
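A note on the hardcoded toolchain paths above: the MacOSX15.4.sdk directory only exists on runner images that ship Xcode 16.3, so pinning it will break on the next image bump. A small sketch (an assumption, not part of the Cirrus config) that derives the sysroot from the selected Xcode instead:

SDKROOT=$(xcrun --sdk macosx --show-sdk-path)   # resolves the SDK belonging to the active Xcode
export CC=$(xcrun --find clang)                 # matching clang from the same toolchain
export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot ${SDKROOT} -arch x86_64"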
@@ -78,8 +78,8 @@ task: | |||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
- export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
- xcrun --sdk iphoneos --show-sdk-path | |||
- ls -l /Applications | |||
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | |||
@@ -127,7 +127,7 @@ task: | |||
FreeBSD_task: | |||
name: FreeBSD-gcc | |||
freebsd_instance: | |||
image_family: freebsd-14-1 | |||
image_family: freebsd-14-2 | |||
install_script: | |||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
compile_script: | |||
@@ -138,7 +138,7 @@ FreeBSD_task: | |||
FreeBSD_task: | |||
name: freebsd-gcc-ilp64 | |||
freebsd_instance: | |||
image_family: freebsd-14-1 | |||
image_family: freebsd-14-2 | |||
install_script: | |||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
compile_script: | |||
@@ -148,7 +148,7 @@ FreeBSD_task: | |||
FreeBSD_task: | |||
name: FreeBSD-clang-openmp | |||
freebsd_instance: | |||
image_family: freebsd-14-1 | |||
image_family: freebsd-14-2 | |||
install_script: | |||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc | |||
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so | |||
@@ -102,6 +102,7 @@ jobs: | |||
mkdir build && cd build | |||
cmake -DDYNAMIC_ARCH=1 \ | |||
-DUSE_OPENMP=${{matrix.openmp}} \ | |||
-DOpenMP_Fortran_LIB_NAMES=omp \ | |||
-DINTERFACE64=${{matrix.ilp64}} \ | |||
-DNOFORTRAN=0 \ | |||
-DBUILD_WITHOUT_LAPACK=0 \ | |||
@@ -31,7 +31,7 @@ jobs: | |||
steps: | |||
- name: Checkout repository | |||
uses: actions/checkout@v3 | |||
uses: actions/checkout@v4 | |||
- name: install build deps | |||
run: | | |||
@@ -40,18 +40,18 @@ jobs: | |||
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev | |||
- name: checkout qemu | |||
uses: actions/checkout@v3 | |||
uses: actions/checkout@v4 | |||
with: | |||
repository: T-head-Semi/qemu | |||
repository: XUANTIE-RV/qemu | |||
path: qemu | |||
ref: 1e692ebb43d396c52352406323fc782c1ac99a42 | |||
ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 | |||
- name: build qemu | |||
run: | | |||
# Force use c910v qemu-user | |||
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
cd qemu | |||
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" | |||
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | |||
make -j$(nproc) | |||
@@ -83,9 +83,39 @@ jobs: | |||
- name: test | |||
run: | | |||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH | |||
qemu-riscv64 ./utest/openblas_utest | |||
qemu-riscv64 ./utest/openblas_utest_ext | |||
run_with_retry() { | |||
local cmd="$1" | |||
local time_out=10 | |||
local retries=10 | |||
local attempt=0 | |||
for ((i=1; i<=retries; i++)); do | |||
attempt=$((i)) | |||
if timeout -s 12 --preserve-status $time_out $cmd; then | |||
echo "Command succeeded on attempt $i." | |||
return 0 | |||
else | |||
local exit_code=$? | |||
if [ $exit_code -eq 140 ]; then | |||
echo "Attempt $i timed out (retrying...)" | |||
time_out=$((time_out + 5)) | |||
else | |||
echo "Attempt $i failed with exit code $exit_code. Aborting workflow." | |||
exit $exit_code | |||
fi | |||
fi | |||
done | |||
echo "All $retries attempts failed, giving up." | |||
echo "Final failure was due to timeout." | |||
echo "Aborting workflow." | |||
exit $exit_code | |||
} | |||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH | |||
which qemu-riscv64 | |||
export QEMU_BIN=$(which qemu-riscv64) | |||
run_with_retry "$QEMU_BIN ./utest/openblas_utest" | |||
run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext" | |||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 | |||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 | |||
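For context on the retry helper above: `timeout -s 12 --preserve-status` makes a timed-out test die from SIGUSR2 and report status 140 (128 + 12), which is the only exit code the loop treats as retryable; any other failure aborts the workflow immediately, and each retry grows the timeout by 5 seconds. A minimal usage sketch, reusing a binary name from the steps above:

run_with_retry "qemu-riscv64 ./utest/openblas_utest"   # retried up to 10 times, on timeout only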
@@ -356,3 +356,23 @@ jobs: | |||
- name: Build OpenBLAS | |||
run: | | |||
make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} | |||
neoverse_build: | |||
if: "github.repository == 'OpenMathLib/OpenBLAS'" | |||
runs-on: ubuntu-24.04-arm | |||
steps: | |||
- name: Checkout repository | |||
uses: actions/checkout@v3 | |||
- name: Install Dependencies | |||
run: | | |||
sudo apt-get update | |||
sudo apt-get install -y gcc gfortran make | |||
- name: Build OpenBLAS | |||
run: | | |||
make -j$(nproc) | |||
make -j$(nproc) lapack-test | |||
@@ -59,7 +59,8 @@ jobs: | |||
- name: Compilation cache | |||
uses: actions/cache@v3 | |||
with: path: ~/.ccache | |||
with: | |||
path: ~/.ccache | |||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
restore-keys: | | |||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
@@ -21,6 +21,8 @@ include(CMakePackageConfigHelpers) | |||
####### | |||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) | |||
option(BUILD_WITHOUT_LAPACKE "Do not build the C interface to LAPACK)" OFF) | |||
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) | |||
set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)") | |||
@@ -60,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th | |||
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
option(BUILD_STATIC_LIBS "Build static library" OFF) | |||
option(BUILD_SHARED_LIBS "Build shared library" OFF) | |||
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | |||
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
endif() | |||
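Because neither library type defaults to ON, the fallback above forces a static-only build when the user selects nothing. A hedged sketch of the resulting invocations (out-of-source build directory assumed):

cmake -S . -B build                                                  # no option given: static library only (forced default)
cmake -S . -B build -DBUILD_SHARED_LIBS=ON                           # shared library only
cmake -S . -B build -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON    # build both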
@@ -75,12 +78,27 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in | |||
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) | |||
set (DELETE_STATIC_LIBS "") | |||
if (NOT BUILD_STATIC_LIBS) | |||
message (STATUS "forcing build of a temporary static library for symbol renaming") | |||
set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) | |||
set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
set (DELETE_STATIC_LIBS file (REMOVE $<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.lib)) | |||
endif () | |||
endif() | |||
####### | |||
if(BUILD_WITHOUT_LAPACK) | |||
set(NO_LAPACK 1) | |||
set(NO_LAPACKE 1) | |||
endif() | |||
if (BUILD_WITHOUT_LAPACKE) | |||
set(NO_LAPACKE 1) | |||
endif() | |||
if(BUILD_WITHOUT_CBLAS) | |||
set(NO_CBLAS 1) | |||
endif() | |||
@@ -103,14 +121,15 @@ endif() | |||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
if (USE_OPENMP) | |||
find_package(OpenMP REQUIRED) | |||
endif () | |||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64) | |||
if (${HAVE64} GREATER -1) | |||
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}) | |||
else () | |||
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
endif () | |||
set(BLASDIRS interface driver/level2 driver/level3 driver/others) | |||
@@ -224,6 +243,12 @@ endif () | |||
# add objects to the openblas lib | |||
if(NOT NO_LAPACK) | |||
add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) | |||
if (USE_OPENMP AND (NOT NOFORTRAN)) | |||
# Disable OpenMP for LAPACK Fortran codes on Windows. | |||
if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran) | |||
endif() | |||
endif() | |||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>") | |||
endif() | |||
if(NOT NO_LAPACKE) | |||
@@ -265,30 +290,59 @@ endif() | |||
if (USE_OPENMP) | |||
if(BUILD_STATIC_LIBS) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||
if(NOFORTRAN) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) | |||
else() | |||
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||
endif() | |||
endif() | |||
if(BUILD_SHARED_LIBS) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||
if(NOFORTRAN) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) | |||
else() | |||
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) | |||
endif() | |||
endif() | |||
endif() | |||
# Seems that this hack doesn't required since macOS 11 Big Sur | |||
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | |||
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
if (NOT NOFORTRAN) | |||
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
else () | |||
set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
endif () | |||
# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on | |||
if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) | |||
# Use response files | |||
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
# Always build static library first | |||
if(BUILD_STATIC_LIBS) | |||
set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") | |||
else() | |||
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") | |||
endif() | |||
set(CREATE_STATIC_LIBRARY_COMMAND | |||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " | |||
"sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") | |||
if(BUILD_SHARED_LIBS) | |||
add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) | |||
set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") | |||
endif() | |||
if(USE_OPENMP) | |||
get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
else() | |||
set(OMP_LIB "") | |||
endif() | |||
if(NOT NOFORTRAN) | |||
set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
if(BUILD_SHARED_LIBS) | |||
set(CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") | |||
endif() | |||
else() | |||
set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
if(BUILD_SHARED_LIBS) | |||
set(CMAKE_C_CREATE_SHARED_LIBRARY | |||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") | |||
endif() | |||
endif() | |||
endif() | |||
# Handle MSVC exports | |||
@@ -373,7 +427,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
endif() | |||
endif() | |||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
if (NOT DEFINED ARCH) | |||
set(ARCH_IN "x86_64") | |||
else() | |||
@@ -461,10 +515,33 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
else () | |||
set (BZ 0) | |||
endif() | |||
if (CMAKE_SYSTEM_NAME MATCHES "Windows") | |||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
if (CMAKE_BUILD_TYPE MATCHES "Debug") | |||
set (CRTLIB msvcrtd) | |||
set (PDBOPT -debug -pdb:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.pdb) | |||
set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
else () | |||
set (CRTLIB msvcrt) | |||
set (PDBOPT "") | |||
endif() | |||
#if (USE_PERL) | |||
message(STATUS "adding postbuild instruction to rename syms") | |||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD | |||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def | |||
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c | |||
COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} | |||
#COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -wholearchive:$<TARGET_FILE:${OpenBLAS_LIBNAME}_static> -dll -out:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll -implib:$<TARGET_FILE_DIR:${OpenBLAS_LIBNAME}_static>/${OpenBLAS_LIBNAME}.dll.a | |||
${REMOVE_STATIC_LIB} VERBATIM | |||
) | |||
#endif () | |||
else () | |||
if (NOT USE_PERL) | |||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so | |||
COMMENT "renaming symbols" | |||
) | |||
else() | |||
@@ -475,6 +552,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
) | |||
endif() | |||
endif() | |||
endif() | |||
if (BUILD_BENCHMARKS) | |||
#find_package(OpenMP REQUIRED) | |||
@@ -645,3 +723,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
install(EXPORT "${PN}${SUFFIX64}Targets" | |||
NAMESPACE "${PN}${SUFFIX64}::" | |||
DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
@@ -26,6 +26,9 @@ | |||
* Chris Sidebottom <chris.sidebottom@arm.com> | |||
* Optimizations and other improvements targeting AArch64 | |||
* Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||
* Optimizations and other improvements targeting AArch64 | |||
## Previous Developers | |||
* Zaheer Chothia <zaheer.chothia@gmail.com> | |||
@@ -231,4 +234,23 @@ In chronological order: | |||
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems | |||
* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | |||
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||
* Annop Wongwathanarat <annop.wongwathanarat@arm.com> | |||
* [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 | |||
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel | |||
* [2025-02-26] Add sbgemv_t_bfdot kernel | |||
* [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 | |||
* [2025-03-12] Optimize aarch64 sgemm_ncopy | |||
* Marek Michalowski <marek.michalowski@arm.com> | |||
* [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` | |||
* [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` | |||
* [2025-02-19] Add thread throttling profile for SGEMV on `NEOVERSEV2` | |||
* Ye Tao <ye.tao@arm.com> | |||
* [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 | |||
* [2025-02-27] Add sbgemv_n_neon kernel | |||
* Abhishek Kumar <https://github.com/abhishek-iitmadras> | |||
* [2025-04-22] Optimise dot kernel for NEOVERSE V1 |
@@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | |||
fi | |||
endif | |||
ifeq ($(OSNAME), WINNT) | |||
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
endif | |||
ifneq ($(OSNAME), AIX) | |||
@echo -n " Library Name ... $(LIBNAME)" | |||
else | |||
@@ -447,7 +452,7 @@ endif | |||
@rm -f cblas.tmp cblas.tmp2 | |||
@touch $(NETLIB_LAPACK_DIR)/make.inc | |||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
@rm -f $(NETLIB_LAPACK_DIR)/make.inc | |||
@$(MAKE) -C relapack clean | |||
@rm -f *.grd Makefile.conf_last config_last.h | |||
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
@@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve | |||
endif | |||
endif | |||
ifeq ($(CORE), ARMV9SME) | |||
CCOMMON_OPT += -march=armv9-a+sve2+sme | |||
FCOMMON_OPT += -march=armv9-a+sve2 | |||
endif | |||
ifeq ($(CORE), CORTEXA53) | |||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
ifneq ($(F_COMPILER), NAG) | |||
@@ -101,7 +106,7 @@ ifeq ($(CORE), NEOVERSEV1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
CCOMMON_OPT += -march=armv8.4-a+sve | |||
CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||
ifeq (1, $(ISCLANG)) | |||
CCOMMON_OPT += -mtune=cortex-x1 | |||
else | |||
@@ -111,7 +116,7 @@ ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
endif | |||
else | |||
CCOMMON_OPT += -march=armv8.4-a+sve | |||
CCOMMON_OPT += -march=armv8.4-a+sve+bf16 | |||
ifneq ($(CROSS), 1) | |||
CCOMMON_OPT += -mtune=native | |||
endif | |||
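The +bf16 extension added to the NEOVERSEV1 flags above is only understood by toolchains whose ACLE support includes BF16; a quick probe along these lines (illustrative, not part of the Makefile) confirms the local compiler accepts the new flags before committing to a full build:

echo 'int main(void){return 0;}' > /tmp/bf16probe.c
${CC:-cc} -march=armv8.4-a+sve+bf16 -c /tmp/bf16probe.c -o /tmp/bf16probe.o \
  && echo "compiler accepts armv8.4-a+sve+bf16" \
  || echo "use TARGET=NEOVERSEN1 or a newer compiler"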
@@ -315,8 +315,8 @@ endif | |||
endif | |||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | |||
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||
endif | |||
endif | |||
@@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1 | |||
endif | |||
ifeq ($(ARCH), arm64) | |||
GEMM_GEMV_FORWARD = 1 | |||
GEMM_GEMV_FORWARD_BF16 = 1 | |||
endif | |||
ifeq ($(ARCH), riscv) | |||
GEMM_GEMV_FORWARD = 1 | |||
@@ -420,6 +421,7 @@ ifeq ($(ARCH), arm64) | |||
export MACOSX_DEPLOYMENT_TARGET=11.0 | |||
ifeq ($(C_COMPILER), GCC) | |||
export NO_SVE = 1 | |||
export NO_SME = 1 | |||
endif | |||
else | |||
export MACOSX_DEPLOYMENT_TARGET=10.8 | |||
@@ -434,6 +436,11 @@ ifeq (x$(XCVER), x 15) | |||
CCOMMON_OPT += -Wl,-ld_classic | |||
FCOMMON_OPT += -Wl,-ld_classic | |||
endif | |||
ifeq (x$(XCVER), x 16) | |||
ifeq ($(F_COMPILER), GFORTRAN) | |||
override CEXTRALIB := $(filter-out -lto_library, $(CEXTRALIB)) | |||
endif | |||
endif | |||
endif | |||
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) | |||
@@ -709,6 +716,9 @@ DYNAMIC_CORE += NEOVERSEN2 | |||
DYNAMIC_CORE += ARMV8SVE | |||
DYNAMIC_CORE += A64FX | |||
endif | |||
ifneq ($(NO_SME), 1) | |||
DYNAMIC_CORE += ARMV9SME | |||
endif | |||
DYNAMIC_CORE += THUNDERX | |||
DYNAMIC_CORE += THUNDERX2T99 | |||
DYNAMIC_CORE += TSV110 | |||
@@ -1472,6 +1482,10 @@ ifeq ($(NO_SVE), 1) | |||
CCOMMON_OPT += -DNO_SVE | |||
endif | |||
ifeq ($(NO_SME), 1) | |||
CCOMMON_OPT += -DNO_SME | |||
endif | |||
ifdef SMP | |||
CCOMMON_OPT += -DSMP_SERVER | |||
@@ -111,6 +111,7 @@ THUNDERX3T110 | |||
VORTEX | |||
A64FX | |||
ARMV8SVE | |||
ARMV9SME | |||
FT2000 | |||
9.System Z: | |||
@@ -164,7 +164,19 @@ jobs: | |||
- script: | | |||
brew update | |||
make CC=gcc-12 FC=gfortran-12 | |||
- job: OSX_LLVM_flangnew | |||
pool: | |||
vmImage: 'macOS-latest' | |||
variables: | |||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
steps: | |||
- script: | | |||
brew update | |||
brew install llvm flang | |||
make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1 | |||
- job: OSX_OpenMP_Clang | |||
pool: | |||
vmImage: 'macOS-latest' | |||
@@ -334,6 +334,24 @@ if [ "$architecture" = "arm64" ]; then | |||
rm -rf "$tmpd" | |||
fi | |||
no_sme=0 | |||
if [ "$architecture" = "arm64" ]; then | |||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') | |||
tmpf="$tmpd/a.S" | |||
printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf" | |||
args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf" | |||
no_sme=0 | |||
{ | |||
$compiler_name $flags $args >/dev/null 2>&1 | |||
} || { | |||
args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf" | |||
$compiler_name $flags $args >/dev/null 2>&1 | |||
} || { | |||
no_sme=1 | |||
} | |||
rm -rf "$tmpd" | |||
fi | |||
c11_atomics=0 | |||
case "$data" in | |||
*HAVE_C11*) | |||
@@ -475,6 +493,7 @@ done | |||
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" | |||
[ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" | |||
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" | |||
[ "$no_sme" -eq 1 ] && printf "NO_SME=1\n" | |||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" | |||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" | |||
@@ -31,22 +31,23 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") | |||
endif () | |||
if (USE_OPENMP) | |||
# USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
# NO_AFFINITY = 1 | |||
find_package(OpenMP REQUIRED) | |||
if (OpenMP_FOUND) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}") | |||
endif() | |||
endif () | |||
if (DYNAMIC_ARCH) | |||
if (ARM64) | |||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
endif () | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||
endif() | |||
elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
endif () | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 | |||
set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||
endif() | |||
endif () | |||
if (DYNAMIC_LIST) | |||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
@@ -84,7 +84,7 @@ endif () | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") | |||
if (POWER) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") | |||
else () | |||
elseif (X86_64) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") | |||
endif () | |||
endif () | |||
@@ -182,7 +182,9 @@ endif () | |||
if (${CORE} STREQUAL A64FX) | |||
if (NOT DYNAMIC_ARCH) | |||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx") | |||
elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
@@ -194,6 +196,8 @@ if (${CORE} STREQUAL NEOVERSEN2) | |||
if (NOT DYNAMIC_ARCH) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") | |||
else () | |||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
@@ -208,6 +212,8 @@ if (${CORE} STREQUAL NEOVERSEV1) | |||
if (NOT DYNAMIC_ARCH) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1") | |||
else () | |||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
@@ -220,10 +226,12 @@ endif () | |||
if (${CORE} STREQUAL NEOVERSEN1) | |||
if (NOT DYNAMIC_ARCH) | |||
if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") | |||
elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") | |||
endif() | |||
endif () | |||
endif () | |||
@@ -232,21 +240,33 @@ if (${CORE} STREQUAL ARMV8SVE) | |||
if (NOT DYNAMIC_ARCH) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | |||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
endif () | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL ARMV9SME) | |||
if (NOT DYNAMIC_ARCH) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") | |||
endif () | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL CORTEXA510) | |||
if (NOT DYNAMIC_ARCH) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL CORTEXA710) | |||
if (NOT DYNAMIC_ARCH) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
endif () | |||
endif () | |||
@@ -258,7 +278,7 @@ endif () | |||
if (${CORE} STREQUAL CORTEXX2) | |||
if (NOT DYNAMIC_ARCH) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
endif () | |||
endif () | |||
@@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L | |||
# This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | |||
endif () | |||
@@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
endif () | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") | |||
endif () | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
if (${F_COMPILER} STREQUAL "FUJITSU") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -q32") | |||
endif () | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") | |||
endif () | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE") | |||
endif () | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64") | |||
if (USE_OPENMP) | |||
set(FEXTRALIB "${FEXTRALIB} -lstdc++") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -mp") | |||
set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
endif () | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") | |||
set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
if (${F_COMPILER} STREQUAL "COMPAQ") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY") | |||
if (NOT USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") | |||
else () | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR") | |||
# -w=unused: Suppress warning messages about unused variables | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") | |||
if (USE_OPENMP) | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") | |||
set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") | |||
endif () | |||
endif () | |||
@@ -79,6 +79,9 @@ macro(SetDefaultL1) | |||
SetFallback(CROTKERNEL zrot.S) | |||
SetFallback(ZROTKERNEL zrot.S) | |||
SetFallback(XROTKERNEL zrot.S) | |||
SetFallback(SROTMKERNEL rotm.S) | |||
SetFallback(DROTMKERNEL rotm.S) | |||
SetFallback(QROTMKERNEL rotm.S) | |||
SetFallback(SSCALKERNEL scal.S) | |||
SetFallback(DSCALKERNEL scal.S) | |||
SetFallback(CSCALKERNEL zscal.S) | |||
@@ -98,6 +98,8 @@ set(CSRC | |||
lapacke_cgesv_work.c | |||
lapacke_cgesvd.c | |||
lapacke_cgesvd_work.c | |||
lapacke_cgesvdq.c | |||
lapacke_cgesvdq_work.c | |||
lapacke_cgesvdx.c | |||
lapacke_cgesvdx_work.c | |||
lapacke_cgesvj.c | |||
@@ -1766,8 +1768,8 @@ set(SSRC | |||
lapacke_strsna_work.c | |||
lapacke_strsyl.c | |||
lapacke_strsyl_work.c | |||
lapacke_ctrsyl3.c | |||
lapacke_ctrsyl3_work.c | |||
lapacke_strsyl3.c | |||
lapacke_strsyl3_work.c | |||
lapacke_strtri.c | |||
lapacke_strtri_work.c | |||
lapacke_strtrs.c | |||
@@ -2410,10 +2412,10 @@ set(ZSRC | |||
lapacke_ilaver.c | |||
) | |||
if (BUILD_LAPACK_DEPRECATED) | |||
set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||
set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||
set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||
set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||
list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) | |||
list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) | |||
list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) | |||
list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) | |||
endif() | |||
set(SRCX | |||
@@ -1006,15 +1006,15 @@ endif () | |||
"#define HAVE_SVE\n" | |||
"#define ARMV8\n") | |||
set(SGEMM_UNROLL_M 16) | |||
set(SGEMM_UNROLL_N 4) | |||
set(DGEMM_UNROLL_M 8) | |||
set(DGEMM_UNROLL_N 4) | |||
set(CGEMM_UNROLL_M 8) | |||
set(SGEMM_UNROLL_N 8) | |||
set(DGEMM_UNROLL_M 4) | |||
set(DGEMM_UNROLL_N 8) | |||
set(CGEMM_UNROLL_M 2) | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_M 2) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||
elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_CODE_SIZE\t65536\n" | |||
"#define L1_CODE_LINESIZE\t64\n" | |||
@@ -1249,6 +1249,25 @@ endif () | |||
set(ZGEMM_UNROLL_M 2) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_DATA_SIZE\t32768\n" | |||
"#define L1_DATA_LINESIZE\t64\n" | |||
"#define L2_SIZE\t262144\n" | |||
"#define L2_LINESIZE\t64\n" | |||
"#define DTB_DEFAULT_ENTRIES\t64\n" | |||
"#define DTB_SIZE\t4096\n" | |||
"#define L2_ASSOCIATIVE\t32\n" | |||
"#define ARMV8\n") | |||
set(SGEMM_UNROLL_M 4) | |||
set(SGEMM_UNROLL_N 8) | |||
set(DGEMM_UNROLL_M 4) | |||
set(DGEMM_UNROLL_N 8) | |||
set(CGEMM_UNROLL_M 2) | |||
set(CGEMM_UNROLL_N 4) | |||
set(ZGEMM_UNROLL_M 2) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "P5600") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L2_SIZE 1048576\n" | |||
@@ -1409,9 +1428,11 @@ endif () | |||
# GetArch_2nd | |||
foreach(float_char S;D;Q;C;Z;X) | |||
if (NOT DEFINED ${float_char}GEMM_UNROLL_M) | |||
message(STATUS "setting unrollm=2") | |||
set(${float_char}GEMM_UNROLL_M 2) | |||
endif() | |||
if (NOT DEFINED ${float_char}GEMM_UNROLL_N) | |||
message(STATUS "setting unrolln=2") | |||
set(${float_char}GEMM_UNROLL_N 2) | |||
endif() | |||
endforeach() | |||
@@ -21,7 +21,15 @@ endif() | |||
# Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? | |||
# It seems we are meant to use TARGET as input and CORE internally as kernel. | |||
if(NOT DEFINED CORE AND DEFINED TARGET) | |||
set(CORE ${TARGET}) | |||
if (${TARGET} STREQUAL "LOONGSON3R5") | |||
set(CORE "LA464") | |||
elseif (${TARGET} STREQUAL "LOONGSON2K1000") | |||
set(CORE "LA264") | |||
elseif (${TARGET} STREQUAL "LOONGSONGENERIC") | |||
set(CORE "LA64_GENERIC)") | |||
else () | |||
set(CORE ${TARGET}) | |||
endif() | |||
endif() | |||
# TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | |||
@@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | |||
endif() | |||
endif() | |||
if (${TARGET} STREQUAL ARMV9SME) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") | |||
endif() | |||
if (${TARGET} STREQUAL A64FX) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | |||
@@ -361,6 +372,20 @@ else () | |||
endif () | |||
endif () | |||
if (USE_OPENMP) | |||
find_package(OpenMP COMPONENTS C REQUIRED) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") | |||
if (NOT NOFORTRAN) | |||
find_package(OpenMP COMPONENTS Fortran REQUIRED) | |||
# Avoid mixed OpenMP linkage | |||
get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) | |||
if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB) | |||
message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") | |||
endif() | |||
endif () | |||
endif () | |||
if (BINARY64) | |||
if (INTERFACE64) | |||
# CCOMMON_OPT += -DUSE64BITINT | |||
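When the mixed-runtime check above aborts the configure step (typically Clang's libomp for C against gfortran's libgomp for Fortran), the cache variables named in the message pin both languages to a single runtime. A hedged example matching the -DOpenMP_Fortran_LIB_NAMES=omp line added to the CI matrix earlier in this patch; the library path is an assumption, point it at whichever runtime you standardize on:

cmake -S . -B build -DUSE_OPENMP=1 -DNOFORTRAN=0 \
  -DOpenMP_Fortran_LIB_NAMES=omp \
  -DOpenMP_omp_LIBRARY=/usr/lib/llvm-18/lib/libomp.so   # assumed path to LLVM's libomp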
@@ -620,6 +645,18 @@ set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||
endif() | |||
# TODO: not sure what PFLAGS is -hpa | |||
set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") | |||
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
if ("${F_COMPILER}" STREQUAL "FLANG") | |||
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
endif () | |||
endif () | |||
if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") | |||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") | |||
endif () | |||
endif () | |||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") | |||
# TODO: not sure what FPFLAGS is -hpa | |||
@@ -632,20 +669,11 @@ if (LAPACK_STRLEN) | |||
endif() | |||
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") | |||
#Disable -fopenmp for LAPACK Fortran codes on Windows. | |||
if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") | |||
foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) | |||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) | |||
endforeach () | |||
endif () | |||
if (CMAKE_Fortran_COMPILER) | |||
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") | |||
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
message(STATUS "removing fortran flags") | |||
message(STATUS "removing fortran flags not supported by the compiler") | |||
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | |||
endif () | |||
foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
@@ -676,13 +704,6 @@ if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL | |||
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") | |||
endif () | |||
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
if ("${F_COMPILER}" STREQUAL "FLANG") | |||
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
endif () | |||
endif () | |||
endif () | |||
if (NOT DEFINED SUFFIX) | |||
set(SUFFIX o) | |||
@@ -139,6 +139,17 @@ endif() | |||
endif() | |||
endif() | |||
if (ARM64) | |||
if (NOT NO_SME) | |||
file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) | |||
if (NO_SME EQUAL 1) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") | |||
endif() | |||
file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o") | |||
endif() | |||
endif() | |||
include(CheckIncludeFile) | |||
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) | |||
if (HAVE_C11 EQUAL 1) | |||
@@ -16,6 +16,14 @@ endfunction () | |||
macro(ParseMakefileVars MAKEFILE_IN) | |||
message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | |||
set (OSNAME ${CMAKE_SYSTEM_NAME}) | |||
if (${C_COMPILER} MATCHES Clang) | |||
set (C_COMPILER CLANG) | |||
endif () | |||
if (${OSNAME} STREQUAL Windows) | |||
set (OSNAME WINNT) | |||
endif () | |||
message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER}) | |||
set (IfElse 0) | |||
set (ElseSeen 0) | |||
set (SkipIfs 0) | |||
@@ -702,6 +702,7 @@ void gotoblas_profile_init(void); | |||
void gotoblas_profile_quit(void); | |||
int support_avx512(void); | |||
int support_sme1(void); | |||
#ifdef USE_OPENMP | |||
@@ -175,7 +175,7 @@ REALNAME: | |||
#define HUGE_PAGESIZE ( 4 << 20) | |||
#ifndef BUFFERSIZE | |||
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) | |||
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) | |||
#define BUFFER_SIZE (32 << 22) | |||
#else | |||
#define BUFFER_SIZE (32 << 20) | |||
@@ -22,6 +22,7 @@ | |||
#define DSUM_K dsum_k | |||
#define DSWAP_K dswap_k | |||
#define DROT_K drot_k | |||
#define DROTM_K drotm_k | |||
#define DGEMV_N dgemv_n | |||
#define DGEMV_T dgemv_t | |||
@@ -180,6 +181,7 @@ | |||
#define DSUM_K gotoblas -> dsum_k | |||
#define DSWAP_K gotoblas -> dswap_k | |||
#define DROT_K gotoblas -> drot_k | |||
#define DROTM_K gotoblas -> drotm_k | |||
#define DGEMV_N gotoblas -> dgemv_n | |||
#define DGEMV_T gotoblas -> dgemv_t | |||
@@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); | |||
int drotmg_k(double *, double *, double *, double *, double *); | |||
int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | |||
int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); | |||
int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); | |||
int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); | |||
int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | |||
@@ -70,6 +70,7 @@ | |||
#define SUM_K QSUM_K | |||
#define SWAP_K QSWAP_K | |||
#define ROT_K QROT_K | |||
#define ROTM_K QROTM_K | |||
#define GEMV_N QGEMV_N | |||
#define GEMV_T QGEMV_T | |||
@@ -361,6 +362,7 @@ | |||
#define SUM_K DSUM_K | |||
#define SWAP_K DSWAP_K | |||
#define ROT_K DROT_K | |||
#define ROTM_K DROTM_K | |||
#define GEMV_N DGEMV_N | |||
#define GEMV_T DGEMV_T | |||
@@ -977,6 +979,7 @@ | |||
#define SUM_K SSUM_K | |||
#define SWAP_K SSWAP_K | |||
#define ROT_K SROT_K | |||
#define ROTM_K SROTM_K | |||
#define GEMV_N SGEMV_N | |||
#define GEMV_T SGEMV_T | |||
@@ -77,6 +77,7 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||
double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -197,6 +198,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
#endif | |||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | |||
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -221,6 +223,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||
#endif | |||
#ifdef ARCH_ARM64 | |||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
#endif | |||
int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -330,6 +336,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
#endif | |||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | |||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
@@ -439,6 +446,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
@@ -22,6 +22,7 @@ | |||
#define QSUM_K qsum_k | |||
#define QSWAP_K qswap_k | |||
#define QROT_K qrot_k | |||
#define QROTM_K qrotm_k | |||
#define QGEMV_N qgemv_n | |||
#define QGEMV_T qgemv_t | |||
@@ -165,6 +166,7 @@ | |||
#define QSUM_K gotoblas -> qsum_k | |||
#define QSWAP_K gotoblas -> qswap_k | |||
#define QROT_K gotoblas -> qrot_k | |||
#define QROTM_K gotoblas -> qrotm_k | |||
#define QGEMV_N gotoblas -> qgemv_n | |||
#define QGEMV_T gotoblas -> qgemv_t | |||
@@ -24,6 +24,7 @@ | |||
#define SSCAL_K sscal_k | |||
#define SSWAP_K sswap_k | |||
#define SROT_K srot_k | |||
#define SROTM_K srotm_k | |||
#define SGEMV_N sgemv_n | |||
#define SGEMV_T sgemv_t | |||
@@ -189,6 +190,7 @@ | |||
#define SSCAL_K gotoblas -> sscal_k | |||
#define SSWAP_K gotoblas -> sswap_k | |||
#define SROT_K gotoblas -> srot_k | |||
#define SROTM_K gotoblas -> srotm_k | |||
#define SGEMV_N gotoblas -> sgemv_n | |||
#define SGEMV_T gotoblas -> sgemv_t | |||
@@ -213,9 +215,9 @@ | |||
#ifdef ARCH_X86_64 | |||
#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | |||
#define SGEMM_DIRECT gotoblas -> sgemm_direct | |||
#else | |||
#elif ARCH_ARM64 | |||
#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||
#define SGEMM_DIRECT sgemm_direct | |||
#define SGEMM_DIRECT gotoblas -> sgemm_direct | |||
#endif | |||
#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | |||
@@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
.global REALNAME ;\ | |||
.type REALNAME, %function ;\ | |||
REALNAME: | |||
#define EPILOGUE | |||
#if defined(__ELF__) && defined(__linux__) | |||
# define GNUSTACK .section .note.GNU-stack,"",@progbits | |||
#else | |||
# define GNUSTACK | |||
#endif | |||
#define EPILOGUE \ | |||
.size REALNAME, .-REALNAME; \ | |||
GNUSTACK | |||
#define PROFCODE | |||
@@ -65,3 +65,6 @@ _cpuid: | |||
.subsections_via_symbols | |||
#endif | |||
#if defined(__ELF__) && defined(__linux__) | |||
.section .note.GNU-stack,"",@progbits | |||
#endif |
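The .note.GNU-stack sections emitted above keep modern binutils and glibc from treating the hand-written assembly objects as demanding an executable stack. Whether a finished library is clean can be verified with a one-liner such as (illustrative check):

readelf -lW libopenblas.so | grep GNU_STACK    # flags column should read RW, not RWE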
@@ -43,6 +43,9 @@ size_t length64=sizeof(value64); | |||
#ifndef HWCAP_SVE | |||
#define HWCAP_SVE (1 << 22) | |||
#endif | |||
#if (defined OS_WINDOWS) | |||
#include <winreg.h> | |||
#endif | |||
#define get_cpu_ftr(id, var) ({ \ | |||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
@@ -371,20 +374,47 @@ int detect(void) | |||
} | |||
#else | |||
#ifdef __APPLE__ | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | |||
cpulowperf=value64; | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | |||
if (value64 > 1) { | |||
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); | |||
cpuhiperf=value64; | |||
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); | |||
cpulowperf=value64; | |||
} | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | |||
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | |||
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 | |||
if (value64 == 1867590060) return CPU_VORTEX; //M4 | |||
#else | |||
#ifdef OS_WINDOWS | |||
HKEY reghandle; | |||
HKEY hklm = HKEY_LOCAL_MACHINE; | |||
WCHAR valstring[512]; | |||
PVOID pvalstring=valstring; | |||
DWORD size=sizeof (valstring); | |||
DWORD type=RRF_RT_ANY; | |||
DWORD flags=0; | |||
LPCWSTR subkey= L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"; | |||
LPCWSTR field=L"ProcessorNameString"; | |||
LONG errcode=RegOpenKeyEx(HKEY_LOCAL_MACHINE,TEXT("Hardware\\Description\\System\\CentralProcessor\\0"), 0, KEY_READ, ®handle); | |||
if (errcode != NO_ERROR) wprintf(L"Could not open registry key for proc0: %x\n",errcode); | |||
errcode=RegQueryValueEx(reghandle, "ProcessorNameString", NULL,NULL ,pvalstring,&size); | |||
if (errcode != ERROR_SUCCESS) wprintf(L"Error reading cpuname from registry:%x\n",errcode); | |||
//wprintf(stderr,L"%s\n",(PWSTR)valstring); | |||
RegCloseKey(reghandle); | |||
if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1; | |||
if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1; | |||
if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1; | |||
if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1; | |||
#endif | |||
#endif | |||
return CPU_ARMV8; | |||
#endif | |||
@@ -442,6 +472,7 @@ int n=0; | |||
printf("#define NUM_CORES_HP %d\n",cpuhiperf); | |||
#endif | |||
#ifdef __APPLE__ | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
printf("#define NUM_CORES %d\n",value); | |||
if (cpulowperf >0) | |||
@@ -673,12 +704,17 @@ void get_cpuconfig(void) | |||
case CPU_VORTEX: | |||
printf("#define VORTEX \n"); | |||
#ifdef __APPLE__ | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
printf("#define L1_CODE_SIZE %lld \n",value64); | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
printf("#define L1_DATA_LINESIZE %lld \n",value64); | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
printf("#define L1_DATA_SIZE %lld \n",value64); | |||
length64 = sizeof(value64); | |||
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
printf("#define L2_SIZE %lld \n",value64); | |||
#endif | |||
@@ -1578,6 +1578,7 @@ int get_cpuname(void){ | |||
case 12: //family 6 exmodel 12 | |||
switch (model) { | |||
case 15: | |||
case 6: // Arrow Lake | |||
if(support_avx512()) | |||
return CPUTYPE_SAPPHIRERAPIDS; | |||
if(support_avx2()) | |||
@@ -2421,6 +2422,22 @@ int get_coretype(void){ | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
case 12: | |||
switch (model) { | |||
case 6: // Arrow Lake | |||
if(support_amx_bf16()) | |||
return CORE_SAPPHIRERAPIDS; | |||
if(support_avx512_bf16()) | |||
return CORE_COOPERLAKE; | |||
if(support_avx512()) | |||
return CORE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
if(support_avx()) | |||
return CORE_SANDYBRIDGE; | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
} | |||
case 15: | |||
if (model <= 0x2) return CORE_NORTHWOOD; | |||
@@ -6,7 +6,7 @@ enable_language(Fortran) | |||
endif() | |||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | |||
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) | |||
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1) | |||
list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) | |||
set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) | |||
endif() | |||
@@ -44,10 +44,6 @@ else() | |||
c_${float_char}blas1.c) | |||
endif() | |||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat1 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat1 m) | |||
endif() | |||
@@ -73,10 +69,6 @@ else() | |||
constant.c) | |||
endif() | |||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat2 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat2 m) | |||
endif() | |||
@@ -124,20 +116,12 @@ else() | |||
endif() | |||
endif() | |||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat3 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat3 m) | |||
endif() | |||
if (USE_GEMM3M) | |||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) | |||
target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) | |||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) | |||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") | |||
target_link_libraries(x${float_char}cblat3 omp pthread) | |||
endif() | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
target_link_libraries(x${float_char}cblat3_3m m) | |||
endif() | |||
@@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
ifeq ($(USE_OPENMP), 1) | |||
ifeq ($(F_COMPILER), GFORTRAN) | |||
ifeq ($(C_COMPILER), CLANG) | |||
CEXTRALIB += -lomp | |||
EXTRALIB += -lomp | |||
endif | |||
endif | |||
ifeq ($(F_COMPILER), NAG) | |||
CEXTRALIB = -lgomp | |||
EXTRALIB = -lgomp | |||
endif | |||
ifeq ($(F_COMPILER), IBM) | |||
ifeq ($(C_COMPILER), GCC) | |||
CEXTRALIB += -lgomp | |||
EXTRALIB += -lgomp | |||
endif | |||
ifeq ($(C_COMPILER), CLANG) | |||
CEXTRALIB += -lomp | |||
EXTRALIB += -lomp | |||
endif | |||
endif | |||
endif | |||
@@ -440,7 +440,7 @@ static real c_b43 = (float)1.; | |||
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | |||
static complex mwpcs[5], mwpct[5]; | |||
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | |||
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||
extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); | |||
static complex cx[8]; | |||
extern real scnrm2test_(integer*, complex*, integer*); | |||
static integer np1; | |||
@@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a | |||
4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | |||
by invoking Ninja: | |||
```cmd | |||
cd OpenBLAS | |||
mkdir build | |||
cd build | |||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new | |||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new | |||
ninja -j16 | |||
``` | |||
@@ -223,3 +223,7 @@ if (USE_THREAD) | |||
endif () | |||
add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) | |||
if (USE_OPENMP) | |||
target_link_libraries(driver_level2 OpenMP::OpenMP_C) | |||
endif() |
@@ -171,3 +171,7 @@ endforeach () | |||
# | |||
add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) | |||
if (USE_OPENMP) | |||
target_link_libraries(driver_level3 OpenMP::OpenMP_C) | |||
endif() |
@@ -547,7 +547,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
#ifdef USE_OPENMP | |||
static omp_lock_t level3_lock, critical_section_lock; | |||
static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, | |||
static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, | |||
parallel_section_left = MAX_PARALLEL_NUMBER; | |||
// Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c | |||
@@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
BLASLONG nthreads = args -> nthreads; | |||
BLASLONG width, i, j, k, js; | |||
BLASLONG width, width_n, i, j, k, js; | |||
BLASLONG m, n, n_from, n_to; | |||
int mode; | |||
#if defined(DYNAMIC_ARCH) | |||
@@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
/* Partition (a step of) n into nthreads regions */ | |||
range_N[0] = js; | |||
num_parts = 0; | |||
while (n > 0){ | |||
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); | |||
if (width < switch_ratio && width > 1) { | |||
width = switch_ratio; | |||
for(j = 0; j < nthreads_n; j++){ | |||
width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j); | |||
n -= width_n; | |||
for(i = 0; i < nthreads_m; i++){ | |||
width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i); | |||
if (width < switch_ratio) { | |||
width = switch_ratio; | |||
} | |||
width = round_up(width_n, width, GEMM_PREFERED_SIZE); | |||
width_n -= width; | |||
if (width_n < 0) { | |||
width = width + width_n; | |||
width_n = 0; | |||
} | |||
range_N[num_parts + 1] = range_N[num_parts] + width; | |||
num_parts ++; | |||
} | |||
width = round_up(n, width, GEMM_PREFERED_SIZE); | |||
n -= width; | |||
if (n < 0) width = width + n; | |||
range_N[num_parts + 1] = range_N[num_parts] + width; | |||
num_parts ++; | |||
} | |||
for (j = num_parts; j < MAX_CPU_NUMBER; j++) { | |||
range_N[j + 1] = range_N[num_parts]; | |||
@@ -844,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||
/* Objective function comes from the sum of partitions in m and n. */ | |||
/* (n / nthreads_n) + (m / nthreads_m) */ | |||
/* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ | |||
while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { | |||
nthreads_m /= 2; | |||
nthreads_n *= 2; | |||
BLASLONG cost = 0, div = 0; | |||
BLASLONG i; | |||
for (i = 1; i <= sqrt(nthreads_m); i++) { | |||
if (nthreads_m % i) continue; | |||
BLASLONG j = nthreads_m / i; | |||
BLASLONG cost_i = n * j + m * nthreads_n * i; | |||
BLASLONG cost_j = n * i + m * nthreads_n * j; | |||
if (cost == 0 || | |||
cost_i < cost) {cost = cost_i; div = i;} | |||
if (cost_j < cost) {cost = cost_j; div = j;} | |||
} | |||
if (div > 1) { | |||
nthreads_m /= div; | |||
nthreads_n *= div; | |||
} | |||
} | |||
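For context on the two hunks above: the old code only halved nthreads_m repeatedly, while the new loop searches all factor pairs (i, j) of nthreads_m and keeps the divisor that minimizes n * nthreads_m' + m * nthreads_n', i.e. the numerator of the per-thread work estimate (the product nthreads_m' * nthreads_n' is invariant under the swap). A minimal standalone sketch of that search, with hypothetical sizes and thread counts rather than the real blas_arg_t plumbing:

```c
#include <math.h>
#include <stdio.h>

/* Re-split threads between the m and n dimensions by picking the divisor of
   nthreads_m that minimizes n*nthreads_m' + m*nthreads_n', mirroring the
   objective (n/nthreads_n) + (m/nthreads_m) up to the invariant product. */
static void rebalance(long m, long n, long *nthreads_m, long *nthreads_n)
{
    long cost = 0, div = 0;
    for (long i = 1; i <= (long)sqrt((double)*nthreads_m); i++) {
        if (*nthreads_m % i) continue;
        long j = *nthreads_m / i;
        long cost_i = n * j + m * (*nthreads_n) * i;  /* divide nthreads_m by i */
        long cost_j = n * i + m * (*nthreads_n) * j;  /* divide nthreads_m by j */
        if (cost == 0 || cost_i < cost) { cost = cost_i; div = i; }
        if (cost_j < cost)              { cost = cost_j; div = j; }
    }
    if (div > 1) { *nthreads_m /= div; *nthreads_n *= div; }
}

int main(void)
{
    long tm = 8, tn = 1;               /* hypothetical initial split */
    rebalance(1024, 65536, &tm, &tn);  /* wide C matrix: threads move to n */
    printf("nthreads_m=%ld nthreads_n=%ld\n", tm, tn);  /* prints 1 and 8 */
    return 0;
}
```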
@@ -88,3 +88,7 @@ endif () | |||
#endif | |||
add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) | |||
if (USE_OPENMP) | |||
target_link_libraries(driver_others OpenMP::OpenMP_C) | |||
endif() |
@@ -146,8 +146,8 @@ typedef struct { | |||
} thread_status_t; | |||
#ifdef HAVE_C11 | |||
#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) | |||
#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) | |||
#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_ACQUIRE) | |||
#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE) | |||
#else | |||
#define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) | |||
#define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) | |||
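The relaxed-to-acquire/release change above matters because the queue pointer doubles as the publication flag for the rest of the queue entry: the enqueuing thread must make its ordinary writes visible before the pointer store, and the worker must not read the entry before the pointer load. A minimal sketch of the same pairing with the same GCC/Clang builtins (simplified struct, not the real blas_queue_t):

```c
#include <stddef.h>

typedef struct { int routine_id; void *args; } queue_t;

static queue_t *slot;   /* shared publication slot */

void publish(queue_t *q)
{
    q->routine_id = 42;   /* ordinary writes must not be reordered ... */
    q->args       = NULL;
    __atomic_store_n(&slot, q, __ATOMIC_RELEASE);   /* ... past this store */
}

queue_t *consume(void)
{
    queue_t *q = __atomic_load_n(&slot, __ATOMIC_ACQUIRE);
    /* if q != NULL here, routine_id and args are guaranteed visible */
    return q;
}
```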
@@ -637,7 +637,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
#ifdef SMP_SERVER | |||
// Handle lazy re-init of the thread-pool after a POSIX fork | |||
LOCK_COMMAND(&server_lock); | |||
if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
UNLOCK_COMMAND(&server_lock); | |||
#endif | |||
BLASLONG i = 0; | |||
blas_queue_t *current = queue; | |||
@@ -43,6 +43,14 @@ | |||
#include <sys/auxv.h> | |||
#endif | |||
#ifdef __APPLE__ | |||
#include <sys/sysctl.h> | |||
int32_t value; | |||
size_t length=sizeof(value); | |||
int64_t value64; | |||
size_t length64=sizeof(value64); | |||
#endif | |||
extern gotoblas_t gotoblas_ARMV8; | |||
#ifdef DYNAMIC_LIST | |||
#ifdef DYN_CORTEXA53 | |||
@@ -115,7 +123,12 @@ extern gotoblas_t gotoblas_ARMV8SVE; | |||
#else | |||
#define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
#endif | |||
#ifdef DYN_CORTEX_A55 | |||
#ifdef DYN_ARMV9SME | |||
extern gotoblas_t gotoblas_ARMV9SME; | |||
#else | |||
#define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
#endif | |||
#ifdef DYN_CORTEXA55 | |||
extern gotoblas_t gotoblas_CORTEXA55; | |||
#else | |||
#define gotoblas_CORTEXA55 gotoblas_ARMV8 | |||
@@ -142,21 +155,28 @@ extern gotoblas_t gotoblas_NEOVERSEV1; | |||
extern gotoblas_t gotoblas_NEOVERSEN2; | |||
extern gotoblas_t gotoblas_ARMV8SVE; | |||
extern gotoblas_t gotoblas_A64FX; | |||
#ifndef NO_SME | |||
extern gotoblas_t gotoblas_ARMV9SME; | |||
#else | |||
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||
#endif | |||
#else | |||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
#define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
#define gotoblas_A64FX gotoblas_ARMV8 | |||
#define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
#endif | |||
extern gotoblas_t gotoblas_THUNDERX3T110; | |||
#endif | |||
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 | |||
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEN2 | |||
extern void openblas_warning(int verbose, const char * msg); | |||
#define FALLBACK_VERBOSE 1 | |||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
#define NUM_CORETYPES 18 | |||
#define NUM_CORETYPES 19 | |||
/* | |||
* In case asm/hwcap.h is outdated on the build system, make sure | |||
@@ -168,6 +188,9 @@ extern void openblas_warning(int verbose, const char * msg); | |||
#ifndef HWCAP_SVE | |||
#define HWCAP_SVE (1 << 22) | |||
#endif | |||
#ifndef HWCAP2_SME | |||
#define HWCAP2_SME (1 << 23) | |||
#endif | |||
#define get_cpu_ftr(id, var) ({ \ | |||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
@@ -192,6 +215,7 @@ static char *corename[] = { | |||
"cortexa55", | |||
"armv8sve", | |||
"a64fx", | |||
"armv9sme", | |||
"unknown" | |||
}; | |||
@@ -214,6 +238,7 @@ char *gotoblas_corename(void) { | |||
if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | |||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | |||
if (gotoblas == &gotoblas_A64FX) return corename[17]; | |||
if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; | |||
return corename[NUM_CORETYPES]; | |||
} | |||
@@ -251,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
case 15: return (&gotoblas_CORTEXA55); | |||
case 16: return (&gotoblas_ARMV8SVE); | |||
case 17: return (&gotoblas_A64FX); | |||
case 18: return (&gotoblas_ARMV9SME); | |||
} | |||
snprintf(message, 128, "Core not found: %s\n", coretype); | |||
openblas_warning(1, message); | |||
@@ -262,6 +288,11 @@ static gotoblas_t *get_coretype(void) { | |||
char coremsg[128]; | |||
#if defined (OS_DARWIN) | |||
//future #if !defined(NO_SME) | |||
// if (support_sme1()) { | |||
// return &gotoblas_ARMV9SME; | |||
// } | |||
// #endif | |||
return &gotoblas_NEOVERSEN1; | |||
#endif | |||
@@ -424,12 +455,20 @@ static gotoblas_t *get_coretype(void) { | |||
} | |||
break; | |||
case 0x61: // Apple | |||
//future if (support_sme1()) return &gotoblas_ARMV9SME; | |||
return &gotoblas_NEOVERSEN1; | |||
break; | |||
default: | |||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
openblas_warning(1, coremsg); | |||
} | |||
#if !defined(NO_SME) | |||
if (support_sme1()) { | |||
return &gotoblas_ARMV9SME; | |||
} | |||
#endif | |||
#ifndef NO_SVE | |||
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
return &gotoblas_ARMV8SVE; | |||
@@ -480,3 +519,19 @@ void gotoblas_dynamic_init(void) { | |||
void gotoblas_dynamic_quit(void) { | |||
gotoblas = NULL; | |||
} | |||
int support_sme1(void) { | |||
int ret = 0; | |||
#if (defined OS_LINUX || defined OS_ANDROID) | |||
ret = getauxval(AT_HWCAP2) & HWCAP2_SME; | |||
if(getauxval(AT_HWCAP2) & HWCAP2_SME){ | |||
ret = 1; | |||
} | |||
#endif | |||
#if defined(__APPLE__) | |||
sysctlbyname("hw.optional.arm.FEAT_SME",&value64,&length64,NULL,0); | |||
ret = value64; | |||
#endif | |||
return ret; | |||
} |
@@ -197,7 +197,7 @@ ifeq ($(F_COMPILER), INTEL) | |||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
else ifeq ($(F_COMPILER), FLANG) | |||
else ifeq ($(F_COMPILER), $(filter $(F_COMPILER),FLANG FLANGNEW)) | |||
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
-Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
@@ -21,7 +21,7 @@ | |||
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, | |||
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, | |||
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, | |||
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); | |||
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr); | |||
@blasobjsd = ( | |||
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, | |||
@@ -29,7 +29,7 @@ | |||
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, | |||
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, | |||
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, | |||
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); | |||
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr); | |||
@blasobjss = ( | |||
isamax,isamin,ismax,ismin, | |||
@@ -38,7 +38,7 @@ | |||
smax,smin,snrm2,simatcopy,somatcopy, | |||
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | |||
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | |||
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); | |||
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr); | |||
@blasobjsz = ( | |||
izamax,izamin,, | |||
@@ -48,28 +48,29 @@ | |||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | |||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | |||
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, | |||
zgeadd, dzsum, zgemmt); | |||
zgeadd, dzsum, zgemmt,zgemmtr); | |||
@blasobjs = (lsame, xerbla); | |||
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); | |||
@cblasobjsc = ( | |||
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, | |||
cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, | |||
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, | |||
cblas_scnrm2, cblas_scasum, | |||
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy | |||
cblas_cgemmt); | |||
cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr, | |||
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, | |||
cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch); | |||
@cblasobjsd = ( | |||
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, | |||
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, | |||
cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, | |||
cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, | |||
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, | |||
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, | |||
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy | |||
cblas_dgemmt); | |||
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr, | |||
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, | |||
cblas_damax, cblas_damin, cblas_dgemm_batch); | |||
@cblasobjss = ( | |||
cblas_sasum, cblas_saxpy, cblas_saxpby, | |||
@@ -78,9 +79,10 @@ | |||
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | |||
cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | |||
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | |||
cblas_strsv, cblas_sgeadd, | |||
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy | |||
cblas_sgemmt); | |||
cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr, | |||
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, | |||
cblas_samax, cblas_samin, cblas_sgemm_batch); | |||
@cblasobjsz = ( | |||
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, | |||
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, | |||
@@ -88,13 +90,13 @@ | |||
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, | |||
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, | |||
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, | |||
cblas_zaxpby, cblas_zgeadd, | |||
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy | |||
cblas_zgemmt); | |||
cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr, | |||
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, | |||
cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch); | |||
@cblasobjs = ( cblas_xerbla ); | |||
@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); | |||
@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); | |||
@exblasobjs = ( | |||
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
@@ -709,6 +711,7 @@ zpotri, | |||
# functions added for lapack-3.7.0 | |||
@lapackobjs2s = (@lapackobjs2s, | |||
slarfy, | |||
ssyconvf, | |||
strevc3, | |||
sgelqt, | |||
sgelqt3, | |||
@@ -832,12 +835,82 @@ zpotri, | |||
zungtsqr_row | |||
); | |||
#functions added for lapack-3.11 | |||
@lapackobjs2c = (@lapackobjs2c, | |||
cgedmd, | |||
cgedmdq | |||
); | |||
@lapackobjs2d = (@lapackobjs2d, | |||
dgedmd, | |||
dgedmdq | |||
); | |||
@lapackobjs2s = (@lapackobjs2s, | |||
sgedmd, | |||
sgedmdq | |||
); | |||
@lapackobjs2z = (@lapackobjs2z, | |||
zgedmd, | |||
zgedmdq | |||
); | |||
#functions added post 3.11 | |||
@lapackobjs2c = (@lapackobjs2c, | |||
cgelst, | |||
cgeqp3rk, | |||
claqp2rk, | |||
claqp3rk, | |||
clatrs3, | |||
crscl, | |||
ctrsyl3 | |||
); | |||
# claqz0 | |||
# claqz1 | |||
# claqz2 | |||
# claqz3 | |||
# clatrs3 | |||
@lapackobjs2d = (@lapackobjs2d, | |||
dgelst, | |||
dgeqp3rk, | |||
dlaqp2rk, | |||
dlaqp3rk, | |||
dlarmm, | |||
dlatrs3, | |||
dtrsyl3 | |||
); | |||
@lapackobjs2s = (@lapackobjs2s, | |||
sgelst, | |||
sgeqp3rk, | |||
slaqp2rk, | |||
slaqp3rk, | |||
slarmm, | |||
slatrs3, | |||
strsyl3 | |||
); | |||
@lapackobjs2z = (@lapackobjs2z, | |||
zgelst, | |||
zgeqp3rk, | |||
zlaqp2rk, | |||
zlaqp3rk, | |||
zlatrs3, | |||
zrscl, | |||
ztrsyl3 | |||
); | |||
# zlaqz0 | |||
# zlaqz1 | |||
# zlaqz2 | |||
# zlaqz3 | |||
@lapack_extendedprecision_objs = ( | |||
zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, | |||
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, | |||
); | |||
@lapack_deprecated_objsc = ( | |||
cgelqs, cgeqrs, | |||
cgegs, cggsvd, | |||
cgegv, cggsvp, | |||
cgelsx, clahrd, | |||
@@ -845,13 +918,16 @@ zpotri, | |||
ctzrqf, | |||
); | |||
@lapack_deprecated_objsd = ( | |||
dgelqs, dgeqrs, | |||
dgegs, dgeqpf, | |||
dgegv, dggsvd, | |||
dgelsx, dggsvp, | |||
dlahrd, | |||
dlatzm, dtzrqf); | |||
@lapack_deprecated_objss = ( | |||
@lapack_deprecated_objss = ( | |||
sgelqs, | |||
sgeqrs, | |||
sgelsx, | |||
sgegs, | |||
sgegv, | |||
@@ -864,6 +940,8 @@ zpotri, | |||
); | |||
@lapack_deprecated_objsz = ( | |||
zgelqs, | |||
zgeqrs, | |||
zgegs, | |||
zgegv, | |||
zgelsx, | |||
@@ -997,6 +1075,10 @@ zpotri, | |||
LAPACKE_cgebrd_work, | |||
LAPACKE_cgecon, | |||
LAPACKE_cgecon_work, | |||
LAPACKE_cgedmd, | |||
LAPACKE_cgedmd_work, | |||
LAPACKE_cgedmdq, | |||
LAPACKE_cgedmdq_work, | |||
LAPACKE_cgeequ, | |||
LAPACKE_cgeequ_work, | |||
LAPACKE_cgeequb, | |||
@@ -1584,8 +1666,15 @@ zpotri, | |||
LAPACKE_cgetsqrhrt, | |||
LAPACKE_cgetsqrhrt_work, | |||
LAPACKE_cungtsqr_row, | |||
LAPACKE_cungtsqr_row_work | |||
LAPACKE_cungtsqr_row_work, | |||
LAPACKE_clangb, | |||
LAPACKE_clangb_work, | |||
LAPACKE_ctrsyl3, | |||
LAPACKE_ctrsyl3_work, | |||
LAPACKE_ctz_nancheck, | |||
LAPACKE_ctz_trans, | |||
LAPACKE_cunhr_col, | |||
LAPACKE_cunhr_col_work | |||
); | |||
@lapackeobjsd = ( | |||
LAPACKE_dgb_nancheck, | |||
@@ -1656,6 +1745,10 @@ zpotri, | |||
LAPACKE_dgebrd_work, | |||
LAPACKE_dgecon, | |||
LAPACKE_dgecon_work, | |||
LAPACKE_dgedmd, | |||
LAPACKE_dgedmd_work, | |||
LAPACKE_dgedmdq, | |||
LAPACKE_dgedmdq_work, | |||
LAPACKE_dgeequ, | |||
LAPACKE_dgeequ_work, | |||
LAPACKE_dgeequb, | |||
@@ -2197,7 +2290,15 @@ zpotri, | |||
LAPACKE_dgetsqrhrt, | |||
LAPACKE_dgetsqrhrt_work, | |||
LAPACKE_dorgtsqr_row, | |||
LAPACKE_dorgtsqr_row_work | |||
LAPACKE_dorgtsqr_row_work, | |||
LAPACKE_dlangb, | |||
LAPACKE_dlangb_work, | |||
LAPACKE_dorhr_col, | |||
LAPACKE_dorhr_col_work, | |||
LAPACKE_dtrsyl3, | |||
LAPACKE_dtrsyl3_work, | |||
LAPACKE_dtz_nancheck, | |||
LAPACKE_dtz_trans, | |||
); | |||
@lapackeobjss = ( | |||
@@ -2269,6 +2370,10 @@ zpotri, | |||
LAPACKE_sgebrd_work, | |||
LAPACKE_sgecon, | |||
LAPACKE_sgecon_work, | |||
LAPACKE_sgedmd, | |||
LAPACKE_sgedmd_work, | |||
LAPACKE_sgedmdq, | |||
LAPACKE_sgedmdq_work, | |||
LAPACKE_sgeequ, | |||
LAPACKE_sgeequ_work, | |||
LAPACKE_sgeequb, | |||
@@ -2802,7 +2907,15 @@ zpotri, | |||
LAPACKE_sgetsqrhrt, | |||
LAPACKE_sgetsqrhrt_work, | |||
LAPACKE_sorgtsqr_row, | |||
LAPACKE_sorgtsqr_row_work | |||
LAPACKE_sorgtsqr_row_work, | |||
LAPACKE_slangb, | |||
LAPACKE_slangb_work, | |||
LAPACKE_sorhr_col, | |||
LAPACKE_sorhr_col_work, | |||
LAPACKE_strsyl3, | |||
LAPACKE_strsyl3_work, | |||
LAPACKE_stz_nancheck, | |||
LAPACKE_stz_trans, | |||
); | |||
@lapackeobjsz = ( | |||
@@ -2878,6 +2991,10 @@ zpotri, | |||
LAPACKE_zgebrd_work, | |||
LAPACKE_zgecon, | |||
LAPACKE_zgecon_work, | |||
LAPACKE_zgedmd, | |||
LAPACKE_zgedmd_work, | |||
LAPACKE_zgedmdq, | |||
LAPACKE_zgedmdq_work, | |||
LAPACKE_zgeequ, | |||
LAPACKE_zgeequ_work, | |||
LAPACKE_zgeequb, | |||
@@ -3345,7 +3462,15 @@ zpotri, | |||
LAPACKE_zgetsqrhrt, | |||
LAPACKE_zgetsqrhrt_work, | |||
LAPACKE_zungtsqr_row, | |||
LAPACKE_zungtsqr_row_work | |||
LAPACKE_zungtsqr_row_work, | |||
LAPACKE_zlangb, | |||
LAPACKE_zlangb_work, | |||
LAPACKE_zunhr_col, | |||
LAPACKE_zunhr_col_work, | |||
LAPACKE_ztrsyl3, | |||
LAPACKE_ztrsyl3_work, | |||
LAPACKE_ztz_nancheck, | |||
LAPACKE_ztz_trans, | |||
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | |||
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | |||
@@ -3551,7 +3676,7 @@ zpotri, | |||
LAPACKE_zsytrs_aa_2stage_work, | |||
# new functions from 3.9.0 | |||
LAPACKE_zgesvdq, | |||
LAPACKE_zgesvdq_work | |||
LAPACKE_zgesvdq_work, | |||
); | |||
#These function may need 2 underscores. | |||
@@ -3573,7 +3698,7 @@ zpotri, | |||
ssygv_2stage, | |||
ssysv_aa_2stage, ssytrf_aa_2stage, | |||
ssytrs_aa_2stage, | |||
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, | |||
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett | |||
); | |||
@lapack_embeded_underscore_objs_c=( | |||
chetf2_rook, chetrf_rook, chetri_rook, | |||
@@ -3598,7 +3723,7 @@ zpotri, | |||
chetrf_aa_2stage, chetrs_aa_2stage, | |||
csysv_aa_2stage, csytrf_aa_2stage, | |||
csytrs_aa_2stage, | |||
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, | |||
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett | |||
); | |||
@lapack_embeded_underscore_objs_d=( | |||
dlasyf_rook, | |||
@@ -3615,7 +3740,7 @@ zpotri, | |||
dsbevd_2stage, dsygv_2stage, | |||
dsysv_aa_2stage, | |||
dsytrf_aa_2stage, dsytrs_aa_2stage, | |||
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, | |||
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett | |||
); | |||
@lapack_embeded_underscore_objs_z=( | |||
zhetf2_rook, zhetrf_rook, zhetri_rook, | |||
@@ -3639,7 +3764,7 @@ zpotri, | |||
zhesv_aa_2stage, zhetrf_aa_2stage, | |||
zhetrs_aa_2stage, zsysv_aa_2stage, | |||
zsytrf_aa_2stage, zsytrs_aa_2stage, | |||
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col | |||
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett | |||
); | |||
@@ -245,6 +245,13 @@ else | |||
;; | |||
*flang*) | |||
vendor=FLANG | |||
data=`$compiler -v 2>&1 > /dev/null` | |||
v="${data#*version *}" | |||
v="${v%%*.}" | |||
major="${v%%.*}" | |||
if [ "$major" -ge 17 ]; then | |||
vendor=FLANGNEW | |||
fi | |||
bu=_ | |||
openmp='-fopenmp' | |||
;; | |||
@@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "ARMV8SVE" | |||
#endif | |||
#ifdef FORCE_ARMV9SME | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "ARMV9SME" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DARMV9SME " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" | |||
#define LIBNAME "armv9sme" | |||
#define CORENAME "ARMV9SME" | |||
#endif | |||
#ifdef FORCE_ARMV8 | |||
#define FORCE | |||
@@ -30,17 +30,17 @@ set(BLAS2_SOURCES | |||
gemv.c ger.c | |||
trsv.c trmv.c | |||
syr2.c gbmv.c | |||
sbmv.c | |||
sbmv.c spmv.c | |||
spr2.c | |||
tbsv.c tbmv.c | |||
tpsv.c tpmv.c | |||
) | |||
set(BLAS2_REAL_ONLY_SOURCES | |||
symv.c syr.c spmv.c spr.c | |||
symv.c syr.c spr.c | |||
) | |||
set(BLAS2_COMPLEX_LAPACK_SOURCES | |||
symv.c syr.c spmv.c spr.c | |||
symv.c syr.c spr.c | |||
) | |||
set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | |||
@@ -109,7 +109,7 @@ endif () | |||
GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) | |||
# gemmtr is gemmt under the name adopted by the Reference BLAS | |||
GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG}) | |||
GenerateNamedObjects("gemm.c" "RNAME" "gemmtr" ${CBLAS_FLAG}) | |||
# max and imax are compiled 4 times | |||
GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) | |||
@@ -126,7 +126,7 @@ if (BUILD_BFLOAT16) | |||
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("gemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
@@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS) | |||
endforeach () | |||
endif() | |||
if (NOT DEFINED NO_LAPACK) | |||
if (NOT NO_LAPACK) | |||
set(LAPACK_SOURCES | |||
lapack/gesv.c | |||
) | |||
@@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
endif () | |||
add_library(interface OBJECT ${OPENBLAS_SRC}) | |||
if (USE_OPENMP) | |||
target_link_libraries(interface OpenMP::OpenMP_C) | |||
endif() |
@@ -1304,9 +1304,9 @@ ifeq ($(BUILD_BFLOAT16),1) | |||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
endif | |||
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
@@ -1328,34 +1328,34 @@ xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -URNAME $< -o $(@F) | |||
sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) | |||
ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
@@ -1,5 +1,5 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2024 The OpenBLAS Project */ | |||
/* Copyright 2024, 2025 The OpenBLAS Project */ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
@@ -177,6 +177,74 @@ static int init_amxtile_permission() { | |||
} | |||
#endif | |||
#ifdef SMP | |||
#ifdef DYNAMIC_ARCH | |||
extern char* gotoblas_corename(void); | |||
#endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { | |||
return | |||
MNK < 262144L ? 1 | |||
: MNK < 1124864L ? MIN(ncpu, 6) | |||
: MNK < 7880599L ? MIN(ncpu, 12) | |||
: MNK < 17173512L ? MIN(ncpu, 16) | |||
: MNK < 33386248L ? MIN(ncpu, 20) | |||
: MNK < 57066625L ? MIN(ncpu, 24) | |||
: MNK < 91733851L ? MIN(ncpu, 32) | |||
: MNK < 265847707L ? MIN(ncpu, 40) | |||
: MNK < 458314011L ? MIN(ncpu, 48) | |||
: MNK < 729000000L ? MIN(ncpu, 56) | |||
: ncpu; | |||
} | |||
#endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) | |||
static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { | |||
return | |||
MNK < 125000L ? 1 | |||
: MNK < 1092727L ? MIN(ncpu, 6) | |||
: MNK < 2628072L ? MIN(ncpu, 8) | |||
: MNK < 8000000L ? MIN(ncpu, 12) | |||
: MNK < 20346417L ? MIN(ncpu, 16) | |||
: MNK < 57066625L ? MIN(ncpu, 24) | |||
: MNK < 91125000L ? MIN(ncpu, 28) | |||
: MNK < 238328000L ? MIN(ncpu, 40) | |||
: MNK < 454756609L ? MIN(ncpu, 48) | |||
: MNK < 857375000L ? MIN(ncpu, 56) | |||
: MNK < 1073741824L ? MIN(ncpu, 64) | |||
: ncpu; | |||
} | |||
#endif | |||
static inline int get_gemm_optimal_nthreads(double MNK) { | |||
int ncpu = num_cpu_avail(3); | |||
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||
} | |||
if (strcmp(gotoblas_corename(), "neoversev2") == 0) { | |||
return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); | |||
} | |||
#endif | |||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { | |||
return 1; | |||
} | |||
else { | |||
if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { | |||
return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||
} | |||
else { | |||
return ncpu; | |||
} | |||
} | |||
} | |||
#endif | |||
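As a rough worked example of how the ladders above are consulted: a 256x256x256 SGEMM gives MNK = 16,777,216, which falls below the 258^3 = 17,173,512 rung of the Neoverse V1 table and therefore caps the thread count at 16. A standalone sketch with the first few rungs copied from the hunk (helper name and sizes are illustrative only):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* First rungs of get_gemm_optimal_nthreads_neoversev1 from the hunk above;
   the remaining thresholds are elided for brevity. */
static int neoversev1_threads(double MNK, int ncpu)
{
    return MNK < 262144.   ? 1
         : MNK < 1124864.  ? MIN(ncpu, 6)
         : MNK < 7880599.  ? MIN(ncpu, 12)
         : MNK < 17173512. ? MIN(ncpu, 16)
         : ncpu;
}

int main(void)
{
    double MNK = 256.0 * 256.0 * 256.0;                      /* 16,777,216 */
    printf("threads = %d\n", neoversev1_threads(MNK, 64));   /* -> 16 */
    return 0;
}
```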
#ifndef CBLAS | |||
void NAME(char *TRANSA, char *TRANSB, | |||
@@ -310,7 +378,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
FLOAT *beta = (FLOAT*) vbeta; | |||
FLOAT *a = (FLOAT*) va; | |||
FLOAT *b = (FLOAT*) vb; | |||
FLOAT *c = (FLOAT*) vc; | |||
FLOAT *c = (FLOAT*) vc; | |||
#endif | |||
blas_arg_t args; | |||
@@ -349,15 +417,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
PRINT_DEBUG_CNAME; | |||
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | |||
#ifdef DYNAMIC_ARCH | |||
if (support_avx512() ) | |||
#endif | |||
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
#if defined(DYNAMIC_ARCH) | |||
if (support_avx512() ) | |||
#endif | |||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
return; | |||
} | |||
#endif | |||
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
#if defined(DYNAMIC_ARCH) | |||
if (support_sme1()) | |||
#endif | |||
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { | |||
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
return; | |||
} | |||
#endif | |||
#endif | |||
#ifndef COMPLEX | |||
@@ -604,13 +682,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
#endif | |||
MNK = (double) args.m * (double) args.n * (double) args.k; | |||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
args.nthreads = 1; | |||
else { | |||
args.nthreads = num_cpu_avail(3); | |||
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) | |||
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||
} | |||
args.nthreads = get_gemm_optimal_nthreads(MNK); | |||
args.common = NULL; | |||
@@ -38,6 +38,17 @@ | |||
#ifndef COMPLEX | |||
#define SMP_THRESHOLD_MIN 65536.0 | |||
#ifdef RNAME | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "QGEMMTR" | |||
#elif defined(DOUBLE) | |||
#define ERROR_NAME "DGEMMTR" | |||
#elif defined(BFLOAT16) | |||
#define ERROR_NAME "SBGEMMTR" | |||
#else | |||
#define ERROR_NAME "SGEMMTR" | |||
#endif | |||
#else | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "QGEMMT " | |||
#elif defined(DOUBLE) | |||
@@ -47,8 +58,18 @@ | |||
#else | |||
#define ERROR_NAME "SGEMMT " | |||
#endif | |||
#endif | |||
#else | |||
#define SMP_THRESHOLD_MIN 8192.0 | |||
#ifdef RNAME | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "XGEMMTR" | |||
#elif defined(DOUBLE) | |||
#define ERROR_NAME "ZGEMMTR" | |||
#else | |||
#define ERROR_NAME "CGEMMTR" | |||
#endif | |||
#else | |||
#ifdef XDOUBLE | |||
#define ERROR_NAME "XGEMMT " | |||
#elif defined(DOUBLE) | |||
@@ -57,6 +78,7 @@ | |||
#define ERROR_NAME "CGEMMT " | |||
#endif | |||
#endif | |||
#endif | |||
#ifndef GEMM_MULTITHREAD_THRESHOLD | |||
#define GEMM_MULTITHREAD_THRESHOLD 4 | |||
@@ -666,5 +688,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
IDEBUG_END; | |||
/* transform B back if necessary */ | |||
#if defined(COMPLEX) | |||
if (transb > 1){ | |||
#ifndef CBLAS | |||
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
#else | |||
if (order == CblasColMajor) | |||
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
if (order == CblasRowMajor) | |||
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
#endif | |||
} | |||
#endif | |||
return; | |||
} |
@@ -63,6 +63,70 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT | |||
}; | |||
#endif | |||
#ifdef SMP | |||
#ifdef DYNAMIC_ARCH | |||
extern char* gotoblas_corename(void); | |||
#endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||
#ifdef DOUBLE | |||
return (MN < 8100L) ? 1 | |||
: (MN < 12100L) ? MIN(ncpu, 2) | |||
: (MN < 36100L) ? MIN(ncpu, 4) | |||
: (MN < 84100L) ? MIN(ncpu, 8) | |||
: (MN < 348100L) ? MIN(ncpu, 16) | |||
: (MN < 435600L) ? MIN(ncpu, 24) | |||
: (MN < 810000L) ? MIN(ncpu, 32) | |||
: (MN < 1050625L) ? MIN(ncpu, 40) | |||
: ncpu; | |||
#else | |||
return (MN < 25600L) ? 1 | |||
: (MN < 63001L) ? MIN(ncpu, 4) | |||
: (MN < 459684L) ? MIN(ncpu, 16) | |||
: ncpu; | |||
#endif | |||
} | |||
#endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) | |||
static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { | |||
return | |||
MN < 24964L ? 1 | |||
: MN < 65536L ? MIN(ncpu, 8) | |||
: MN < 262144L ? MIN(ncpu, 32) | |||
: MN < 1638400L ? MIN(ncpu, 64) | |||
: ncpu; | |||
} | |||
#endif | |||
static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | |||
int ncpu = num_cpu_avail(3); | |||
#if defined(_WIN64) && defined(_M_ARM64) | |||
if (MN > 100000000L) | |||
return num_cpu_avail(4); | |||
return 1; | |||
#endif | |||
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||
} | |||
if (strcmp(gotoblas_corename(), "neoversev2") == 0) { | |||
return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); | |||
} | |||
#endif | |||
if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
return 1; | |||
else | |||
return num_cpu_avail(2); | |||
} | |||
#endif | |||
#ifndef CBLAS | |||
void NAME(char *TRANS, blasint *M, blasint *N, | |||
@@ -202,13 +266,6 @@ void CNAME(enum CBLAS_ORDER order, | |||
if (alpha == ZERO) return; | |||
#if 0 | |||
/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ | |||
if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { | |||
GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); | |||
return; | |||
} | |||
#endif | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
@@ -225,11 +282,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
#ifdef SMP | |||
if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(2); | |||
nthreads = get_gemv_optimal_nthreads(1L * m * n); | |||
if (nthreads == 1) { | |||
#endif | |||
@@ -107,21 +107,35 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, | |||
#ifndef PPC440 | |||
buffer = (FLOAT *)blas_memory_alloc(1); | |||
sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
#endif | |||
#ifdef SMP | |||
args.common = NULL; | |||
#ifndef DOUBLE | |||
if (args.m*args.n < 40000) | |||
#if defined(_WIN64) && defined(_M_ARM64) | |||
#ifdef COMPLEX | |||
if (args.m * args.n <= 300) | |||
#else | |||
if (args.m * args.n <= 500) | |||
#endif | |||
args.nthreads = 1; | |||
else if (args.m * args.n <= 1000) | |||
args.nthreads = 4; | |||
else | |||
args.nthreads = num_cpu_avail(4); | |||
#else | |||
if (args.m*args.n < 10000) | |||
#ifndef DOUBLE | |||
if (args.m * args.n < 40000) | |||
#else | |||
if (args.m * args.n < 10000) | |||
#endif | |||
args.nthreads = 1; | |||
else | |||
args.nthreads = num_cpu_avail(4); | |||
#endif | |||
args.nthreads=1; | |||
else | |||
args.nthreads = num_cpu_avail(4); | |||
if (args.nthreads == 1) { | |||
#endif | |||
@@ -61,6 +61,37 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
#else | |||
return fabsf(x[0]); | |||
#endif | |||
#endif | |||
if (incx == 0) | |||
#ifndef COMPLEX | |||
#ifdef DOUBLE | |||
return (sqrt((double)n)*fabs(x[0])); | |||
#else | |||
return (sqrt((float)n)*fabsf(x[0])); | |||
#endif | |||
#else | |||
#ifdef DOUBLE | |||
{ | |||
double fr=fabs(x[0]); | |||
double fi=fabs(x[1]); | |||
double fmin=MIN(fr,fi); | |||
double fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||
return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#else | |||
{ | |||
float fr=fabs(x[0]); | |||
float fi=fabs(x[1]); | |||
float fmin=MIN(fr,fi); | |||
float fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||
return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#endif | |||
#endif | |||
if (incx < 0) | |||
@@ -97,13 +128,44 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
if (n <= 0) return 0.; | |||
#ifndef COMPLEX | |||
#ifndef COMPLEX | |||
if (n == 1) | |||
#ifdef DOUBLE | |||
return fabs(x[0]); | |||
#else | |||
return fabsf(x[0]); | |||
#endif | |||
#endif | |||
if (incx == 0) | |||
#ifndef COMPLEX | |||
#ifdef DOUBLE | |||
return (sqrt((double)n)*fabs(x[0])); | |||
#else | |||
return (sqrt((float)n)*fabsf(x[0])); | |||
#endif | |||
#else | |||
#ifdef DOUBLE | |||
{ | |||
double fr=fabs(x[0]); | |||
double fi=fabs(x[1]); | |||
double fmin=MIN(fr,fi); | |||
double fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); | |||
return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#else | |||
{ | |||
float fr=fabs(x[0]); | |||
float fi=fabs(x[1]); | |||
float fmin=MIN(fr,fi); | |||
float fmax=MAX(fr,fi); | |||
if (fmax==0.) return(fmax); | |||
if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); | |||
return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); | |||
} | |||
#endif | |||
#endif | |||
if (incx < 0) | |||
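The incx == 0 branch added in both nrm2 hunks simply evaluates the norm of a vector whose n entries all equal x[0], using the usual overflow-safe scaling for the complex modulus; restated as a formula (no behavior beyond what the code already does):

\[
\|x\|_2 = \sqrt{n}\,|x_0|, \qquad
|x_0| = f_{\max}\sqrt{1 + (f_{\min}/f_{\max})^2},
\]

where \(f_{\max} = \max(|\mathrm{Re}\,x_0|, |\mathrm{Im}\,x_0|)\) and \(f_{\min} = \min(|\mathrm{Re}\,x_0|, |\mathrm{Im}\,x_0|)\), with the special cases \(f_{\max} = 0 \Rightarrow 0\) and \(f_{\max} = f_{\min} \Rightarrow \sqrt{2n}\,f_{\max}\), matching the early returns in the code.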
@@ -7,149 +7,21 @@ | |||
void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | |||
blasint n = *N; | |||
blasint incx = *INCX; | |||
blasint incy = *INCY; | |||
blasint n = *N; | |||
blasint incx = *INCX; | |||
blasint incy = *INCY; | |||
PRINT_DEBUG_NAME | |||
#else | |||
void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | |||
#endif | |||
blasint i__1, i__2; | |||
PRINT_DEBUG_CNAME; | |||
blasint i__; | |||
FLOAT w, z__; | |||
blasint kx, ky; | |||
FLOAT dh11, dh12, dh22, dh21, dflag; | |||
blasint nsteps; | |||
#ifndef CBLAS | |||
PRINT_DEBUG_CNAME; | |||
#else | |||
PRINT_DEBUG_CNAME; | |||
#endif | |||
--dparam; | |||
--dy; | |||
--dx; | |||
dflag = dparam[1]; | |||
if (n <= 0 || dflag == - 2.0) goto L140; | |||
if (! (incx == incy && incx > 0)) goto L70; | |||
nsteps = n * incx; | |||
if (dflag < 0.) { | |||
goto L50; | |||
} else if (dflag == 0) { | |||
goto L10; | |||
} else { | |||
goto L30; | |||
} | |||
L10: | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
i__1 = nsteps; | |||
i__2 = incx; | |||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w + z__ * dh12; | |||
dy[i__] = w * dh21 + z__; | |||
/* L20: */ | |||
} | |||
goto L140; | |||
L30: | |||
dh11 = dparam[2]; | |||
dh22 = dparam[5]; | |||
i__2 = nsteps; | |||
i__1 = incx; | |||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w * dh11 + z__; | |||
dy[i__] = -w + dh22 * z__; | |||
/* L40: */ | |||
} | |||
goto L140; | |||
L50: | |||
dh11 = dparam[2]; | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
dh22 = dparam[5]; | |||
i__1 = nsteps; | |||
i__2 = incx; | |||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w * dh11 + z__ * dh12; | |||
dy[i__] = w * dh21 + z__ * dh22; | |||
/* L60: */ | |||
} | |||
goto L140; | |||
L70: | |||
kx = 1; | |||
ky = 1; | |||
if (incx < 0) { | |||
kx = (1 - n) * incx + 1; | |||
} | |||
if (incy < 0) { | |||
ky = (1 - n) * incy + 1; | |||
} | |||
ROTM_K(n, dx, incx, dy, incy, dparam); | |||
if (dflag < 0.) { | |||
goto L120; | |||
} else if (dflag == 0) { | |||
goto L80; | |||
} else { | |||
goto L100; | |||
} | |||
L80: | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w + z__ * dh12; | |||
dy[ky] = w * dh21 + z__; | |||
kx += incx; | |||
ky += incy; | |||
/* L90: */ | |||
} | |||
goto L140; | |||
L100: | |||
dh11 = dparam[2]; | |||
dh22 = dparam[5]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w * dh11 + z__; | |||
dy[ky] = -w + dh22 * z__; | |||
kx += incx; | |||
ky += incy; | |||
/* L110: */ | |||
} | |||
goto L140; | |||
L120: | |||
dh11 = dparam[2]; | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
dh22 = dparam[5]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w * dh11 + z__ * dh12; | |||
dy[ky] = w * dh21 + z__ * dh22; | |||
kx += incx; | |||
ky += incy; | |||
/* L130: */ | |||
} | |||
L140: | |||
return; | |||
} | |||
@@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order, | |||
#ifdef SMP | |||
if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) | |||
#if defined(_WIN64) && defined(_M_ARM64) | |||
if (m*n > 25000000L) | |||
nthreads = num_cpu_avail(4); | |||
else | |||
nthreads = 1; | |||
#else | |||
if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(2); | |||
#endif | |||
if (nthreads == 1) { | |||
#endif | |||
#endif | |||
(gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); | |||
#ifdef SMP | |||
} else { | |||
(gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); | |||
} | |||
#endif | |||
STACK_FREE(buffer); | |||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | |||
@@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
#else | |||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
FLOAT *buffer; | |||
int uplo; | |||
blasint info; | |||
FLOAT * ALPHA = &alpha; | |||
FLOAT * ALPHA = (FLOAT*)valpha; | |||
FLOAT alpha_r = ALPHA[0]; | |||
FLOAT alpha_i = ALPHA[1]; | |||
#ifdef SMP | |||
@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | |||
@@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | |||
GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | |||
GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | |||
GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") | |||
endif () | |||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | |||
@@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | |||
@@ -198,25 +201,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
# Makefile.L3 | |||
set(USE_TRMM false) | |||
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||
if (ARM OR ARM64 OR RISCV64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||
set(USE_TRMM true) | |||
endif () | |||
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||
set(USE_TRMM true) | |||
endif () | |||
set(USE_DIRECT_SGEMM false) | |||
if (X86_64) | |||
if (X86_64 OR ARM64) | |||
set(USE_DIRECT_SGEMM true) | |||
endif() | |||
if (USE_DIRECT_SGEMM) | |||
# if (NOT DEFINED SGEMMDIRECTKERNEL) | |||
if (X86_64) | |||
set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) | |||
set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | |||
# endif() | |||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||
elseif (ARM64) | |||
set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) | |||
set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) | |||
set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) | |||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||
if (HAVE_SME) | |||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) | |||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) | |||
endif () | |||
endif () | |||
endif() | |||
foreach (float_type SINGLE DOUBLE) | |||
@@ -1105,6 +1118,7 @@ endif () | |||
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | |||
@@ -1352,6 +1366,9 @@ endif () | |||
if (USE_GEMM3M) | |||
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | |||
endif() | |||
if (USE_OPENMP) | |||
target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C) | |||
endif() | |||
endfunction () | |||
@@ -24,7 +24,11 @@ ifdef NO_AVX2 | |||
AVX2OPT= | |||
endif | |||
ifdef TARGET_CORE | |||
ifeq ($(TARGET_CORE), ARMV9SME) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme | |||
endif | |||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | |||
@@ -336,6 +336,18 @@ ifndef XROTKERNEL | |||
XROTKERNEL = zrot.S | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = rotm.S | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = rotm.S | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = rotm.S | |||
endif | |||
### SCAL ### | |||
ifndef SSCALKERNEL | |||
@@ -504,21 +516,21 @@ SBLASOBJS += \ | |||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
saxpby_k$(TSUFFIX).$(SUFFIX) | |||
saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) | |||
DBLASOBJS += \ | |||
damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | |||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) | |||
QBLASOBJS += \ | |||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
qsum_k$(TSUFFIX).$(SUFFIX) | |||
qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) | |||
CBLASOBJS += \ | |||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
@@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||
$(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | |||
@@ -24,6 +24,7 @@ endif | |||
ifeq ($(ARCH), arm64) | |||
USE_TRMM = 1 | |||
USE_DIRECT_SGEMM = 1 | |||
endif | |||
ifeq ($(ARCH), riscv64) | |||
@@ -95,9 +96,17 @@ endif | |||
ifdef USE_DIRECT_SGEMM | |||
ifndef SGEMMDIRECTKERNEL | |||
ifeq ($(ARCH), x86_64) | |||
SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c | |||
SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | |||
endif | |||
ifeq ($(ARCH), arm64) | |||
ifeq ($(TARGET_CORE), ARMV9SME) | |||
HAVE_SME = 1 | |||
endif | |||
SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c | |||
endif | |||
endif | |||
endif | |||
ifeq ($(BUILD_BFLOAT16), 1) | |||
@@ -128,9 +137,20 @@ SKERNELOBJS += \ | |||
$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | |||
ifdef USE_DIRECT_SGEMM | |||
ifeq ($(ARCH), x86_64) | |||
SKERNELOBJS += \ | |||
sgemm_direct$(TSUFFIX).$(SUFFIX) \ | |||
sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||
sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ifeq ($(ARCH), arm64) | |||
SKERNELOBJS += \ | |||
sgemm_direct$(TSUFFIX).$(SUFFIX) | |||
ifdef HAVE_SME | |||
SKERNELOBJS += \ | |||
sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ | |||
sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) | |||
endif | |||
endif | |||
endif | |||
endif | |||
@@ -809,11 +829,23 @@ else | |||
endif | |||
ifdef USE_DIRECT_SGEMM | |||
ifeq ($(ARCH), x86_64) | |||
$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | |||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
endif | |||
ifeq ($(ARCH), arm64) | |||
$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
ifdef HAVE_SME | |||
$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : | |||
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ | |||
$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : | |||
$(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ | |||
endif | |||
endif | |||
endif | |||
ifeq ($(BUILD_BFLOAT16), 1) | |||
@@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -45,4 +45,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -1,6 +1,6 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
SGEMVNKERNEL = gemv_n_sve.c | |||
DGEMVNKERNEL = gemv_n_sve.c | |||
SGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
DGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
SGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v4x3.c |
@@ -74,16 +74,21 @@ DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
SGEMVNKERNEL = gemv_n.S | |||
DGEMVNKERNEL = gemv_n.S | |||
SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
CGEMVNKERNEL = zgemv_n.S | |||
ZGEMVNKERNEL = zgemv_n.S | |||
SGEMVTKERNEL = gemv_t.S | |||
DGEMVTKERNEL = gemv_t.S | |||
SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
SASUMKERNEL = sasum_thunderx2t99.c | |||
DASUMKERNEL = dasum_thunderx2t99.c | |||
CASUMKERNEL = casum_thunderx2t99.c | |||
@@ -0,0 +1,3 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
@@ -60,7 +60,7 @@ DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
SGEMVNKERNEL = gemv_n.S | |||
SGEMVNKERNEL = sgemv_n_neon.c | |||
DGEMVNKERNEL = gemv_n.S | |||
CGEMVNKERNEL = zgemv_n.S | |||
ZGEMVNKERNEL = zgemv_n.S | |||
@@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
SASUMKERNEL = sasum_thunderx2t99.c | |||
DASUMKERNEL = dasum_thunderx2t99.c | |||
@@ -98,8 +102,18 @@ ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
ifeq ($(OSNAME), WINNT) | |||
ifeq ($(C_COMPILER), CLANG) | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
else | |||
CDOTKERNEL = zdot_thunderx2t99.c | |||
ZDOTKERNEL = zdot_thunderx2t99.c | |||
endif | |||
else | |||
CDOTKERNEL = zdot_thunderx2t99.c | |||
ZDOTKERNEL = zdot_thunderx2t99.c | |||
endif | |||
DSDOTKERNEL = dot.S | |||
DGEMM_BETA = dgemm_beta.S | |||
@@ -60,13 +60,13 @@ DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
SGEMVNKERNEL = gemv_n.S | |||
DGEMVNKERNEL = gemv_n.S | |||
SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
CGEMVNKERNEL = zgemv_n.S | |||
ZGEMVNKERNEL = zgemv_n.S | |||
SGEMVTKERNEL = gemv_t.S | |||
DGEMVTKERNEL = gemv_t.S | |||
SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
@@ -198,3 +198,5 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||
SBGEMVNKERNEL = sbgemv_n_neon.c |
@@ -1,4 +1,24 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
ifeq ($(BUILD_BFLOAT16), 1) | |||
SBGEMM_BETA = sbgemm_beta_neoversev1.c | |||
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c | |||
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) | |||
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c | |||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c | |||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
SBGEMVNKERNEL = sbgemv_n_neon.c | |||
SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||
endif |
@@ -1 +1,6 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
ifeq ($(BUILD_BFLOAT16), 1) | |||
SBGEMVTKERNEL = sbgemv_t_bfdot.c | |||
SBGEMVNKERNEL = sbgemv_n_neon.c | |||
endif |
@@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -1,216 +1,217 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2017, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#define N "x0" /* vector length */ | |||
#define X "x1" /* X vector address */ | |||
#define INC_X "x2" /* X stride */ | |||
#define Y "x3" /* Y vector address */ | |||
#define INC_Y "x4" /* Y stride */ | |||
#define J "x5" /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
#if !defined(COMPLEX) | |||
#if !defined(DOUBLE) | |||
#define TMPF "s0" | |||
#define INC_SHIFT "2" | |||
#define N_DIV_SHIFT "2" | |||
#define N_REM_MASK "3" | |||
#else | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#endif | |||
#else | |||
#if !defined(DOUBLE) | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#else | |||
#define TMPF "q0" | |||
#define INC_SHIFT "4" | |||
#define N_DIV_SHIFT "0" | |||
#define N_REM_MASK "0" | |||
#endif | |||
#endif | |||
#define KERNEL_F1 \ | |||
"ldr "TMPF", ["X"] \n" \ | |||
"add "X", "X", "INC_X" \n" \ | |||
"str "TMPF", ["Y"] \n" \ | |||
"add "Y", "Y", "INC_Y" \n" | |||
#define KERNEL_F \ | |||
"ldr q0, ["X"], #16 \n" \ | |||
"str q0, ["Y"], #16 \n" | |||
#define INIT \ | |||
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||
"lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||
static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
if ( n < 0 ) return 0; | |||
__asm__ __volatile__ ( | |||
" mov "N", %[N_] \n" | |||
" mov "X", %[X_] \n" | |||
" mov "INC_X", %[INCX_] \n" | |||
" mov "Y", %[Y_] \n" | |||
" mov "INC_Y", %[INCY_] \n" | |||
" cmp "N", xzr \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
"// .Lcopy_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq 2f //copy_kernel_F1 \n" | |||
" .align 5 \n" | |||
"1: //copy_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 1b //copy_kernel_F \n" | |||
"2: //copy_kernel_F1: \n" | |||
#if defined(COMPLEX) && defined(DOUBLE) | |||
" b 8f //copy_kernel_L999 \n" | |||
#else | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
#endif | |||
"3: //copy_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 3b //copy_kernel_F10 \n" | |||
" b 8f //copy_kernel_L999 \n" | |||
"4: //copy_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble 6f //copy_kernel_S1 \n" | |||
"5: //copy_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 5b //copy_kernel_S4 \n" | |||
"6: //copy_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
"7: //copy_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 7b //copy_kernel_S10 \n" | |||
"8: //copy_kernel_L999: \n" | |||
: | |||
: [N_] "r" (n), //%1 | |||
[X_] "r" (x), //%2 | |||
[INCX_] "r" (inc_x), //%3 | |||
[Y_] "r" (y), //%4 | |||
[INCY_] "r" (inc_y) //%5 | |||
: "cc", | |||
"memory", | |||
"x0", "x1", "x2", "x3", "x4", "x5", | |||
"d0" | |||
); | |||
return 0; | |||
} | |||
#if defined(SMP) | |||
static int copy_thread_function(BLASLONG n, BLASLONG dummy0, | |||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||
{ | |||
do_copy(n, x, inc_x, y, inc_y); | |||
return 0; | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
#if defined(SMP) | |||
int nthreads; | |||
FLOAT dummy_alpha; | |||
#endif | |||
if (n <= 0) return 0; | |||
#if defined(SMP) | |||
if (inc_x == 0 || n <= 10000) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(1); | |||
if (nthreads == 1) { | |||
do_copy(n, x, inc_x, y, inc_y); | |||
} else { | |||
int mode = 0; | |||
#if !defined(COMPLEX) | |||
mode = BLAS_REAL; | |||
#else | |||
mode = BLAS_COMPLEX; | |||
#endif | |||
#if !defined(DOUBLE) | |||
mode |= BLAS_SINGLE; | |||
#else | |||
mode |= BLAS_DOUBLE; | |||
#endif | |||
blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||
x, inc_x, y, inc_y, NULL, 0, | |||
( void *)copy_thread_function, nthreads); | |||
} | |||
#else | |||
do_copy(n, x, inc_x, y, inc_y); | |||
#endif | |||
return 0; | |||
} | |||
/*************************************************************************** | |||
Copyright (c) 2017, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#define N "x0" /* vector length */ | |||
#define X "x1" /* X vector address */ | |||
#define INC_X "x2" /* X stride */ | |||
#define Y "x3" /* Y vector address */ | |||
#define INC_Y "x4" /* Y stride */ | |||
#define J "x5" /* loop variable */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
#if !defined(COMPLEX) | |||
#if !defined(DOUBLE) | |||
#define TMPF "s0" | |||
#define INC_SHIFT "2" | |||
#define N_DIV_SHIFT "2" | |||
#define N_REM_MASK "3" | |||
#else | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#endif | |||
#else | |||
#if !defined(DOUBLE) | |||
#define TMPF "d0" | |||
#define INC_SHIFT "3" | |||
#define N_DIV_SHIFT "1" | |||
#define N_REM_MASK "1" | |||
#else | |||
#define TMPF "q0" | |||
#define INC_SHIFT "4" | |||
#define N_DIV_SHIFT "0" | |||
#define N_REM_MASK "0" | |||
#endif | |||
#endif | |||
#define KERNEL_F1 \ | |||
"ldr "TMPF", ["X"] \n" \ | |||
"add "X", "X", "INC_X" \n" \ | |||
"str "TMPF", ["Y"] \n" \ | |||
"add "Y", "Y", "INC_Y" \n" | |||
#define KERNEL_F \ | |||
"ldr q0, ["X"], #16 \n" \ | |||
"str q0, ["Y"], #16 \n" | |||
#define INIT \ | |||
"lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ | |||
"lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" | |||
static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
if ( n < 0 ) return 0; | |||
__asm__ __volatile__ ( | |||
" mov "N", %[N_] \n" | |||
" mov "X", %[X_] \n" | |||
" mov "INC_X", %[INCX_] \n" | |||
" mov "Y", %[Y_] \n" | |||
" mov "INC_Y", %[INCY_] \n" | |||
" cmp "N", xzr \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
"// .Lcopy_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq 2f //copy_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
" .align 5 \n" | |||
#endif | |||
"1: //copy_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 1b //copy_kernel_F \n" | |||
"2: //copy_kernel_F1: \n" | |||
#if defined(COMPLEX) && defined(DOUBLE) | |||
" b 8f //copy_kernel_L999 \n" | |||
#else | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
#endif | |||
"3: //copy_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 3b //copy_kernel_F10 \n" | |||
" b 8f //copy_kernel_L999 \n" | |||
"4: //copy_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble 6f //copy_kernel_S1 \n" | |||
"5: //copy_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 5b //copy_kernel_S4 \n" | |||
"6: //copy_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
"7: //copy_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne 7b //copy_kernel_S10 \n" | |||
"8: //copy_kernel_L999: \n" | |||
: | |||
: [N_] "r" (n), //%1 | |||
[X_] "r" (x), //%2 | |||
[INCX_] "r" (inc_x), //%3 | |||
[Y_] "r" (y), //%4 | |||
[INCY_] "r" (inc_y) //%5 | |||
: "cc", | |||
"memory", | |||
"x0", "x1", "x2", "x3", "x4", "x5", | |||
"d0" | |||
); | |||
return 0; | |||
} | |||
#if defined(SMP) | |||
static int copy_thread_function(BLASLONG n, BLASLONG dummy0, | |||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) | |||
{ | |||
do_copy(n, x, inc_x, y, inc_y); | |||
return 0; | |||
} | |||
#endif | |||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
#if defined(SMP) | |||
int nthreads; | |||
FLOAT dummy_alpha; | |||
#endif | |||
if (n <= 0) return 0; | |||
#if defined(SMP) | |||
if (inc_x == 0 || n <= 10000) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(1); | |||
if (nthreads == 1) { | |||
do_copy(n, x, inc_x, y, inc_y); | |||
} else { | |||
int mode = 0; | |||
#if !defined(COMPLEX) | |||
mode = BLAS_REAL; | |||
#else | |||
mode = BLAS_COMPLEX; | |||
#endif | |||
#if !defined(DOUBLE) | |||
mode |= BLAS_SINGLE; | |||
#else | |||
mode |= BLAS_DOUBLE; | |||
#endif | |||
blas_level1_thread(mode, n, 0, 0, &dummy_alpha, | |||
x, inc_x, y, inc_y, NULL, 0, | |||
( void *)copy_thread_function, nthreads); | |||
} | |||
#else | |||
do_copy(n, x, inc_x, y, inc_y); | |||
#endif | |||
return 0; | |||
} |
@@ -152,7 +152,9 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" cmp "J", xzr \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
".align 5 \n" | |||
#endif | |||
"2: //asum_kernel_F32: \n" | |||
" "KERNEL_F32" \n" | |||
" subs "J", "J", #1 \n" | |||
@@ -213,7 +213,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n2 = N & -2; | |||
const BLASLONG n8 = N & -8; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -219,7 +219,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n4 = N & -4; | |||
const BLASLONG n2 = N & -2; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
#ifdef DYNAMIC_ARCH | |||
extern char* gotoblas_corename(void); | |||
#endif | |||
#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
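/* Thread-count ladder for dot on Neoverse V1: small vectors stay
 * single-threaded and the cap grows with N up to all available cpus;
 * separate tables are used for double and single precision. */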
static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { | |||
#ifdef DOUBLE | |||
return (N <= 10000L) ? 1 | |||
: (N <= 64500L) ? 1 | |||
: (N <= 100000L) ? MIN(ncpu, 2) | |||
: (N <= 150000L) ? MIN(ncpu, 4) | |||
: (N <= 260000L) ? MIN(ncpu, 8) | |||
: (N <= 360000L) ? MIN(ncpu, 16) | |||
: (N <= 520000L) ? MIN(ncpu, 24) | |||
: (N <= 1010000L) ? MIN(ncpu, 56) | |||
: ncpu; | |||
#else | |||
return (N <= 10000L) ? 1 | |||
: (N <= 110000L) ? 1 | |||
: (N <= 200000L) ? MIN(ncpu, 2) | |||
: (N <= 280000L) ? MIN(ncpu, 4) | |||
: (N <= 520000L) ? MIN(ncpu, 8) | |||
: (N <= 830000L) ? MIN(ncpu, 16) | |||
: (N <= 1010000L) ? MIN(ncpu, 24) | |||
: ncpu; | |||
#endif | |||
} | |||
#endif | |||
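/* Choose the thread count for dot: the Neoverse V1 table is used for real,
 * non-bfloat16 dot when that target is fixed at compile time, or when
 * DYNAMIC_ARCH resolves the core name to "neoversev1" at run time; every
 * other case keeps the old rule of going parallel only above n = 10000. */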
static inline int get_dot_optimal_nthreads(BLASLONG n) { | |||
int ncpu = num_cpu_avail(1); | |||
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
} | |||
#endif | |||
// Default case | |||
if (n <= 10000L) | |||
return 1; | |||
else | |||
return num_cpu_avail(1); | |||
} | |||
#endif | |||
static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
@@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
RETURN_TYPE dot = 0.0; | |||
#if defined(SMP) | |||
if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
if (inc_x == 0 || inc_y == 0) | |||
nthreads = 1; | |||
else | |||
nthreads = num_cpu_avail(1); | |||
nthreads = get_dot_optimal_nthreads(n); | |||
if (nthreads == 1) { | |||
dot = dot_compute(n, x, inc_x, y, inc_y); | |||
@@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
x, inc_x, y, inc_y, result, 0, | |||
( void *)dot_thread_function, nthreads); | |||
(void *)dot_thread_function, nthreads); | |||
ptr = (RETURN_TYPE *)result; | |||
for (i = 0; i < nthreads; i++) { | |||
@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
" fadd v4.4s, v4.4s, v6.4s \n" \ | |||
" fadd v0.4s, v0.4s, v4.4s \n" \ | |||
" faddp v0.4s, v0.4s, v0.4s \n" \ | |||
" faddp v0.4s, v0.4s, v0.4s \n" | |||
" faddp "OUT", v0.2s \n" | |||
#else /* !defined(DSDOT) */ | |||
#define KERNEL_F1 \ | |||
@@ -285,8 +285,9 @@ static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT | |||
" asr %[J_], %[N_], #"N_DIV_SHIFT" \n" | |||
" cmp %[J_], xzr \n" | |||
" beq 3f //dot_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
" .align 5 \n" | |||
#endif | |||
"2: //dot_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs %[J_], %[J_], #1 \n" | |||
@@ -1,5 +1,5 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2024, The OpenBLAS Project | |||
Copyright (c) 2024-2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
@@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
a_ptr = a; | |||
if (inc_y == 1) { | |||
BLASLONG width = n / 3; | |||
uint64_t sve_size = SV_COUNT(); | |||
for (j = 0; j < n; j++) { | |||
SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); | |||
i = 0; | |||
svbool_t pg = SV_WHILE(i, m); | |||
while (svptest_any(SV_TRUE(), pg)) { | |||
SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
svbool_t pg_true = SV_TRUE(); | |||
svbool_t pg = SV_WHILE(0, m % sve_size); | |||
FLOAT *a0_ptr = a + lda * width * 0; | |||
FLOAT *a1_ptr = a + lda * width * 1; | |||
FLOAT *a2_ptr = a + lda * width * 2; | |||
for (j = 0; j < width; j++) { | |||
for (i = 0; (i + sve_size - 1) < m; i += sve_size) { | |||
ix = j * inc_x; | |||
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); | |||
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); | |||
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); | |||
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||
SV_TYPE y_vec = svld1(pg_true, y + i); | |||
y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0); | |||
y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0); | |||
y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0); | |||
svst1(pg_true, y + i, y_vec); | |||
} | |||
if (i < m) { | |||
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); | |||
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); | |||
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); | |||
SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||
SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||
SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||
SV_TYPE y_vec = svld1(pg, y + i); | |||
y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); | |||
y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec); | |||
y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec); | |||
y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec); | |||
ix += inc_x; | |||
svst1(pg, y + i, y_vec); | |||
i += sve_size; | |||
pg = SV_WHILE(i, m); | |||
} | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
} | |||
a_ptr = a2_ptr; | |||
for (j = width * 3; j < n; j++) { | |||
ix = j * inc_x; | |||
for (i = 0; (i + sve_size - 1) < m; i += sve_size) { | |||
SV_TYPE y_vec = svld1(pg_true, y + i); | |||
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); | |||
SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||
y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); | |||
svst1(pg_true, y + i, y_vec); | |||
} | |||
if (i < m) { | |||
SV_TYPE y_vec = svld1(pg, y + i); | |||
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); | |||
SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
y_vec = svmla_m(pg, y_vec, a_vec, x_vec); | |||
svst1(pg, y + i, y_vec); | |||
} | |||
a_ptr += lda; | |||
ix += inc_x; | |||
} | |||
return(0); | |||
return (0); | |||
} | |||
for (j = 0; j < n; j++) { | |||
@@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
ix += inc_x; | |||
} | |||
return (0); | |||
} | |||
} |
@@ -0,0 +1,138 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#ifdef DOUBLE | |||
#define SV_COUNT svcntd | |||
#define SV_TYPE svfloat64_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64_s64 | |||
#define SV_DUP svdup_f64 | |||
#else | |||
#define SV_COUNT svcntw | |||
#define SV_TYPE svfloat32_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32_s64 | |||
#define SV_DUP svdup_f32 | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT temp; | |||
ix = 0; | |||
a_ptr = a; | |||
if (inc_y == 1) { | |||
BLASLONG width = (n + 3 - 1) / 3; | |||
FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||
FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
FLOAT *x0_ptr = x + inc_x * width * 0; | |||
FLOAT *x1_ptr = x + inc_x * width * 1; | |||
FLOAT *x2_ptr = x + inc_x * width * 2; | |||
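  /* The n columns are split into three groups of `width`; each j iteration
     folds one column from each group into y, with pg00/pg01/pg02 turning a
     group off once j + width * k reaches n. Rows advance one SVE vector at
     a time, and the final partial chunk is masked with a whilelt predicate. */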
for (j = 0; j < width; j++) { | |||
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||
SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
i = 0; | |||
BLASLONG sve_size = SV_COUNT(); | |||
while ((i + sve_size * 1 - 1) < m) { | |||
SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
i += sve_size * 1; | |||
} | |||
if (i < m) { | |||
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
svst1_vnum(pg0, y + i, 0, y0_vec); | |||
} | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
ix += inc_x; | |||
} | |||
return(0); | |||
} | |||
for (j = 0; j < n; j++) { | |||
temp = alpha * x[ix]; | |||
iy = 0; | |||
for (i = 0; i < m; i++) { | |||
y[iy] += temp * a_ptr[i]; | |||
iy += inc_y; | |||
} | |||
a_ptr += lda; | |||
ix += inc_x; | |||
} | |||
return (0); | |||
} |
@@ -0,0 +1,207 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#ifdef DOUBLE | |||
#define SV_COUNT svcntd | |||
#define SV_TYPE svfloat64_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64_s64 | |||
#define SV_DUP svdup_f64 | |||
#else | |||
#define SV_COUNT svcntw | |||
#define SV_TYPE svfloat32_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32_s64 | |||
#define SV_DUP svdup_f32 | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT temp; | |||
ix = 0; | |||
a_ptr = a; | |||
if (inc_y == 1) { | |||
BLASLONG width = (n + 3 - 1) / 3; | |||
FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||
FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
FLOAT *x0_ptr = x + inc_x * width * 0; | |||
FLOAT *x1_ptr = x + inc_x * width * 1; | |||
FLOAT *x2_ptr = x + inc_x * width * 2; | |||
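  /* The n columns are again split into three groups of `width`, one column
     from each group per j iteration. Rows advance four SVE vectors at a time
     (vnum 0..3), with one predicate per vector and per column group so that
     both the column tail (n not a multiple of 3) and the row tail (m not a
     multiple of 4 * sve_size) are masked correctly. */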
for (j = 0; j < width; j++) { | |||
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||
SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
i = 0; | |||
BLASLONG sve_size = SV_COUNT(); | |||
while ((i + sve_size * 4 - 1) < m) { | |||
SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); | |||
SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); | |||
SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); | |||
svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); | |||
svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); | |||
i += sve_size * 4; | |||
} | |||
if (i < m) { | |||
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); | |||
svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); | |||
svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); | |||
pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
pg10 = svand_z(SV_TRUE(), pg1, pg10); | |||
pg20 = svand_z(SV_TRUE(), pg2, pg20); | |||
pg30 = svand_z(SV_TRUE(), pg3, pg30); | |||
pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
pg11 = svand_z(SV_TRUE(), pg1, pg11); | |||
pg21 = svand_z(SV_TRUE(), pg2, pg21); | |||
pg31 = svand_z(SV_TRUE(), pg3, pg31); | |||
pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
pg12 = svand_z(SV_TRUE(), pg1, pg12); | |||
pg22 = svand_z(SV_TRUE(), pg2, pg22); | |||
pg32 = svand_z(SV_TRUE(), pg3, pg32); | |||
SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); | |||
SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); | |||
SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
svst1_vnum(pg0, y + i, 0, y0_vec); | |||
svst1_vnum(pg1, y + i, 1, y1_vec); | |||
svst1_vnum(pg2, y + i, 2, y2_vec); | |||
svst1_vnum(pg3, y + i, 3, y3_vec); | |||
} | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
ix += inc_x; | |||
} | |||
return(0); | |||
} | |||
for (j = 0; j < n; j++) { | |||
temp = alpha * x[ix]; | |||
iy = 0; | |||
for (i = 0; i < m; i++) { | |||
y[iy] += temp * a_ptr[i]; | |||
iy += inc_y; | |||
} | |||
a_ptr += lda; | |||
ix += inc_x; | |||
} | |||
return (0); | |||
} |
@@ -1,5 +1,5 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2024, The OpenBLAS Project | |||
Copyright (c) 2024, 2025 The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
@@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT *y_ptr; | |||
FLOAT temp; | |||
iy = 0; | |||
if (inc_x == 1) { | |||
BLASLONG width = (n + 3 - 1) / 3; | |||
BLASLONG width = n / 3; | |||
BLASLONG sve_size = SV_COUNT(); | |||
svbool_t pg_true = SV_TRUE(); | |||
svbool_t pg = SV_WHILE(0, m % sve_size); | |||
FLOAT *a0_ptr = a + lda * width * 0; | |||
FLOAT *a1_ptr = a + lda * width * 1; | |||
@@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
FLOAT *y2_ptr = y + inc_y * width * 2; | |||
for (j = 0; j < width; j++) { | |||
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
SV_TYPE temp00_vec = SV_DUP(0.0); | |||
SV_TYPE temp01_vec = SV_DUP(0.0); | |||
SV_TYPE temp02_vec = SV_DUP(0.0); | |||
i = 0; | |||
BLASLONG sve_size = SV_COUNT(); | |||
while ((i + sve_size * 1 - 1) < m) { | |||
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); | |||
SV_TYPE x0_vec = svld1(pg_true, x + i); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); | |||
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); | |||
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); | |||
temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec); | |||
i += sve_size * 1; | |||
} | |||
if (i < m) { | |||
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
SV_TYPE x0_vec = svld1(pg, x + i); | |||
SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); | |||
SV_TYPE a00_vec = svld1(pg, a0_ptr + i); | |||
SV_TYPE a01_vec = svld1(pg, a1_ptr + i); | |||
SV_TYPE a02_vec = svld1(pg, a2_ptr + i); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec); | |||
} | |||
if ((j + width * 0) < n) { | |||
temp = svaddv(SV_TRUE(), temp00_vec); | |||
y0_ptr[iy] += alpha * temp; | |||
} | |||
if ((j + width * 1) < n) { | |||
temp = svaddv(SV_TRUE(), temp01_vec); | |||
y1_ptr[iy] += alpha * temp; | |||
} | |||
if ((j + width * 2) < n) { | |||
temp = svaddv(SV_TRUE(), temp02_vec); | |||
y2_ptr[iy] += alpha * temp; | |||
} | |||
y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec); | |||
y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec); | |||
y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec); | |||
iy += inc_y; | |||
a0_ptr += lda; | |||
@@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
a2_ptr += lda; | |||
} | |||
a_ptr = a2_ptr; | |||
y_ptr = y2_ptr; | |||
for (j = width * 3; j < n; j++) { | |||
SV_TYPE temp_vec = SV_DUP(0.0); | |||
i = 0; | |||
while ((i + sve_size * 1 - 1) < m) { | |||
SV_TYPE x_vec = svld1(pg_true, x + i); | |||
SV_TYPE a_vec = svld1(pg_true, a_ptr + i); | |||
temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec); | |||
i += sve_size * 1; | |||
} | |||
if (i < m) { | |||
SV_TYPE x_vec = svld1(pg, x + i); | |||
SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); | |||
} | |||
y_ptr[iy] += alpha * svaddv(pg_true, temp_vec); | |||
iy += inc_y; | |||
a_ptr += lda; | |||
} | |||
return(0); | |||
} | |||
@@ -153,8 +153,9 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" asr "J", "N", #6 \n" | |||
" cmp "J", xzr \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
#if !(defined(__clang__) && defined(OS_WINDOWS)) | |||
".align 5 \n" | |||
#endif | |||
"2: //asum_kernel_F64: \n" | |||
" "KERNEL_F64" \n" | |||
" subs "J", "J", #1 \n" | |||
@@ -0,0 +1,83 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include "common.h" | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, | |||
BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, | |||
BLASLONG ldc) { | |||
BLASLONG i, j; | |||
BLASLONG chunk, remain; | |||
FLOAT *c_offset1, *c_offset; | |||
c_offset = c; | |||
chunk = m >> 3; | |||
remain = m & 7; | |||
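  /* Rows are handled eight at a time (chunk = m >> 3) with a scalar tail of
     m & 7 elements. beta == 0 stores zeros outright instead of multiplying,
     so anything already in C (including non-finite values) is overwritten
     rather than scaled. */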
if (beta == ZERO) { | |||
for (j = n; j > 0; j--) { | |||
c_offset1 = c_offset; | |||
c_offset += ldc; | |||
for (i = chunk; i > 0; i--) { | |||
*(c_offset1 + 0) = ZERO; | |||
*(c_offset1 + 1) = ZERO; | |||
*(c_offset1 + 2) = ZERO; | |||
*(c_offset1 + 3) = ZERO; | |||
*(c_offset1 + 4) = ZERO; | |||
*(c_offset1 + 5) = ZERO; | |||
*(c_offset1 + 6) = ZERO; | |||
*(c_offset1 + 7) = ZERO; | |||
c_offset1 += 8; | |||
} | |||
for (i = remain; i > 0; i--) { | |||
*c_offset1 = ZERO; | |||
c_offset1++; | |||
} | |||
} | |||
} else { | |||
for (j = n; j > 0; j--) { | |||
c_offset1 = c_offset; | |||
c_offset += ldc; | |||
for (i = chunk; i > 0; i--) { | |||
*(c_offset1 + 0) *= beta; | |||
*(c_offset1 + 1) *= beta; | |||
*(c_offset1 + 2) *= beta; | |||
*(c_offset1 + 3) *= beta; | |||
*(c_offset1 + 4) *= beta; | |||
*(c_offset1 + 5) *= beta; | |||
*(c_offset1 + 6) *= beta; | |||
*(c_offset1 + 7) *= beta; | |||
c_offset1 += 8; | |||
} | |||
for (i = remain; i > 0; i--) { | |||
*c_offset1 *= beta; | |||
c_offset1++; | |||
} | |||
} | |||
} | |||
return 0; | |||
}; |
@@ -0,0 +1,46 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#define ALPHA_ONE | |||
#include "sbgemm_kernel_4x4_neoversev1_impl.c" | |||
#undef ALPHA_ONE | |||
#include "sbgemm_kernel_4x4_neoversev1_impl.c" | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||
FLOAT *C, BLASLONG ldc) { | |||
if (alpha == 1.0f) | |||
return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc); | |||
else | |||
return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc); | |||
return 0; | |||
} | |||
@@ -0,0 +1,414 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#define INIT_C(M, N) mc##M##N = svdup_f32(0); | |||
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); | |||
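/* Per 128-bit segment, svbfmmla multiplies a 2x4 bf16 block from ma##M by a
 * 4x2 bf16 block from mb##N and accumulates a 2x2 fp32 result, so each
 * mc##M##N holds interleaved partial sums that are recombined by the
 * ZIP_*_ELEMENTS and ACCUMULATE_LAST4_TO_FIRST4 helpers before the writeback
 * to C. */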
#define INIT_C_4x4 \ | |||
do { \ | |||
INIT_C(0, 0); \ | |||
INIT_C(0, 1); \ | |||
INIT_C(1, 0); \ | |||
INIT_C(1, 1); \ | |||
} while (0); | |||
#ifdef ALPHA_ONE | |||
#define UPDATE_C(PG, PTR, DST, SRC) \ | |||
do { \ | |||
DST = svld1_f32((PG), (PTR)); \ | |||
DST = svadd_z((PG), SRC, DST); \ | |||
svst1_f32((PG), (PTR), DST); \ | |||
} while (0); | |||
#else | |||
#define UPDATE_C(PG, PTR, DST, SRC) \ | |||
do { \ | |||
DST = svld1_f32((PG), (PTR)); \ | |||
DST = svmad_z((PG), svalpha, SRC, DST); \ | |||
svst1_f32((PG), (PTR), DST); \ | |||
} while (0); | |||
#endif | |||
#define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc) \ | |||
do { \ | |||
(tmp) = svuzp1_f32((mc0), (mc1)); \ | |||
(vc) = svcompact_f32((PG), (tmp)); \ | |||
} while (0) | |||
#define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc) \ | |||
do { \ | |||
(tmp) = svuzp2_f32((mc0), (mc1)); \ | |||
(vc) = svcompact_f32((PG), (tmp)); \ | |||
} while (0) | |||
#define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP) \ | |||
do { \ | |||
TMP = svext_f32(mc##M##N, mc##M##N, 4); \ | |||
mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP)); \ | |||
} while (0) | |||
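/* The kernel targets Neoverse V1 (256-bit SVE), so each accumulator spans two
 * 128-bit segments; this helper rotates the vector by four fp32 lanes and
 * adds, folding the upper segment's partial sums onto the lower one. */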
#ifdef ALPHA_ONE | |||
int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, | |||
FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||
FLOAT *C, BLASLONG ldc) | |||
#else | |||
int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, | |||
FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, | |||
BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG pad_k = (k + 7) & ~7; | |||
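// k is rounded up to a multiple of 8 to match the zero-padded panels produced
// by the packing routines, so the inner loops can always consume full
// 8-element bf16 chunks.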
svbfloat16_t ma0, ma1, mb0, mb1; | |||
svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3; | |||
svfloat32_t tmp; | |||
svfloat32_t svalpha = svdup_f32(alpha); | |||
svbool_t pg16_all = svptrue_b16(); | |||
svbool_t pg32_first_1 = svwhilelt_b32(0, 1); | |||
svbool_t pg32_first_2 = svwhilelt_b32(0, 2); | |||
svbool_t pg32_first_4 = svwhilelt_b32(0, 4); | |||
svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0); | |||
bfloat16_t *ptr_a = (bfloat16_t *)A; | |||
bfloat16_t *ptr_b = (bfloat16_t *)B; | |||
FLOAT *ptr_c = C; | |||
bfloat16_t *ptr_a0; | |||
bfloat16_t *ptr_b0; | |||
FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; | |||
for (BLASLONG j = 0; j < n / 4; j++) { | |||
ptr_c0 = ptr_c; | |||
ptr_c1 = ptr_c0 + ldc; | |||
ptr_c2 = ptr_c1 + ldc; | |||
ptr_c3 = ptr_c2 + ldc; | |||
ptr_c += 4 * ldc; | |||
ptr_a = (bfloat16_t *)A; | |||
for (BLASLONG i = 0; i < m / 4; i++) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 4 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C_4x4; | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||
MATMUL(0, 0); | |||
MATMUL(0, 1); | |||
MATMUL(1, 0); | |||
MATMUL(1, 1); | |||
ptr_a0 += 32; | |||
ptr_b0 += 32; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||
ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2); | |||
ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3); | |||
UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1); | |||
UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2);
UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3);
ptr_c0 += 4; | |||
ptr_c1 += 4; | |||
ptr_c2 += 4; | |||
ptr_c3 += 4; | |||
} | |||
if (m & 2) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 2 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(0, 1); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||
MATMUL(0, 0); | |||
MATMUL(0, 1); | |||
ptr_a0 += 16; | |||
ptr_b0 += 32; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||
vc0 = svuzp1(mc00, mc00); | |||
vc1 = svuzp2(mc00, mc00); | |||
vc2 = svuzp1(mc01, mc01); | |||
vc3 = svuzp2(mc01, mc01); | |||
UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); | |||
UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2); | |||
UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3); | |||
ptr_c0 += 2; | |||
ptr_c1 += 2; | |||
ptr_c2 += 2; | |||
ptr_c3 += 2; | |||
} | |||
if (m & 1) { | |||
ptr_a0 = ptr_a; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(0, 1); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); | |||
MATMUL(0, 0); | |||
MATMUL(0, 1); | |||
ptr_a0 += 16; | |||
ptr_b0 += 32; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); | |||
// using svcompact here would be more straightforward
vc1 = svuzp2(mc00, mc00); | |||
vc3 = svuzp2(mc01, mc01); | |||
UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||
UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); | |||
UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01); | |||
UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3); | |||
} | |||
ptr_b += 4 * pad_k; | |||
} | |||
if (n & 2) { | |||
ptr_c0 = ptr_c; | |||
ptr_c1 = ptr_c0 + ldc; | |||
ptr_c += 2 * ldc; | |||
ptr_a = (bfloat16_t *)A; | |||
for (BLASLONG i = 0; i < m / 4; i++) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 4 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(1, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
MATMUL(1, 0); | |||
ptr_a0 += 32; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||
ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2); | |||
UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2); | |||
ptr_c0 += 4; | |||
ptr_c1 += 4; | |||
} | |||
if (m & 2) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 2 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
vc0 = svuzp1(mc00, mc00); | |||
vc1 = svuzp2(mc00, mc00); | |||
UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||
UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); | |||
ptr_c0 += 2; | |||
ptr_c1 += 2; | |||
} | |||
if (m & 1) { | |||
ptr_a0 = ptr_a; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
vc1 = svuzp2(mc00, mc00); | |||
UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||
UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); | |||
} | |||
ptr_b += 2 * pad_k; | |||
} | |||
if (n & 1) { // TODO: this path looks like pure overhead; check whether it is
             // ever hit in our use cases.
ptr_c0 = ptr_c; | |||
ptr_a = (bfloat16_t *)A; | |||
for (BLASLONG i = 0; i < m / 4; i++) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 4 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
INIT_C(1, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
MATMUL(1, 0); | |||
ptr_a0 += 32; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); | |||
ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); | |||
UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); | |||
ptr_c0 += 4; | |||
} | |||
if (m & 2) { | |||
ptr_a0 = ptr_a; | |||
ptr_a += 2 * pad_k; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
vc0 = svuzp1(mc00, mc00); | |||
UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); | |||
ptr_c0 += 2; | |||
} | |||
if (m & 1) { | |||
ptr_a0 = ptr_a; | |||
ptr_b0 = ptr_b; | |||
INIT_C(0, 0); | |||
for (BLASLONG p = 0; p < pad_k; p += 8) { | |||
ma0 = svld1_bf16(pg16_all, ptr_a0); | |||
mb0 = svld1_bf16(pg16_all, ptr_b0); | |||
MATMUL(0, 0); | |||
ptr_a0 += 16; | |||
ptr_b0 += 16; | |||
} | |||
ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); | |||
UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,148 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
IFLOAT *a_offset; | |||
IFLOAT *a_offsetx[4]; | |||
IFLOAT *b_offset; | |||
a_offset = a; | |||
b_offset = b; | |||
bfloat16_t zero_value_bf16; | |||
*((uint16_t *)(&zero_value_bf16)) = 0; | |||
svbool_t pg16_all = svptrue_b16(); // 16 bf16 elements on an SVE-256 machine.
svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||
svbfloat16_t v0, v1, v2, v3; | |||
svuint64_t t0, t1; | |||
BLASLONG rest = m & 7; | |||
svbool_t pg16_rest = svwhilelt_b16_s32(0, rest); | |||
for (BLASLONG j = 0; j < n / 4; j++) { | |||
a_offsetx[0] = a_offset; | |||
a_offsetx[1] = a_offsetx[0] + lda; | |||
a_offsetx[2] = a_offsetx[1] + lda; | |||
a_offsetx[3] = a_offsetx[2] + lda; | |||
a_offset += 4 * lda; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); | |||
v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]); | |||
v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, | |||
svreinterpret_bf16_u64(t1)); | |||
a_offsetx[0] += 8; | |||
a_offsetx[1] += 8; | |||
a_offsetx[2] += 8; | |||
a_offsetx[3] += 8; | |||
b_offset += 32; | |||
} | |||
if (rest) { // remainder along k dim | |||
v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); | |||
v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]); | |||
v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, | |||
svreinterpret_bf16_u64(t1)); | |||
b_offset += 32; | |||
} | |||
} | |||
if (n & 2) { | |||
a_offsetx[0] = a_offset; | |||
a_offsetx[1] = a_offsetx[0] + lda; | |||
a_offset += 2 * lda; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
b_offset += 16; | |||
a_offsetx[0] += 8; | |||
a_offsetx[1] += 8; | |||
} | |||
if (rest) { // remainder along k dim | |||
v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
b_offset += 16; | |||
} | |||
} | |||
if (n & 1) { | |||
a_offsetx[0] = a_offset; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svdup_bf16(zero_value_bf16); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
b_offset += 16; | |||
a_offsetx[0] += 8; | |||
} | |||
if (rest) { // remainder along k dim | |||
v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); | |||
v1 = svdup_bf16(zero_value_bf16); | |||
t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); | |||
svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,361 @@ | |||
/*************************************************************************** | |||
* Copyright (c) 2024-2025, The OpenBLAS Project | |||
* All rights reserved. | |||
* Redistribution and use in source and binary forms, with or without | |||
* modification, are permitted provided that the following conditions are | |||
* met: | |||
* 1. Redistributions of source code must retain the above copyright | |||
* notice, this list of conditions and the following disclaimer. | |||
* 2. Redistributions in binary form must reproduce the above copyright | |||
* notice, this list of conditions and the following disclaimer in | |||
* the documentation and/or other materials provided with the | |||
* distribution. | |||
* 3. Neither the name of the OpenBLAS project nor the names of | |||
* its contributors may be used to endorse or promote products | |||
* derived from this software without specific prior written permission. | |||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
* POSSIBILITY OF SUCH DAMAGE. | |||
* *****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#include <arm_sve.h> | |||
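/* Transposing variant of the packing routine: 8x8 (or smaller remainder)
 * blocks are transposed with zip1/zip2 at 16- and 32-bit granularity and
 * written out with 64-bit scatter stores; the remainder paths zero-fill the
 * missing rows and columns. */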
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
BLASLONG pad_m = ((m + 7) & ~7); | |||
BLASLONG rest = (m & 7); // rest along m dim | |||
IFLOAT *a_offset; | |||
IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3; | |||
IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7; | |||
IFLOAT *b_offset; | |||
IFLOAT *b_offset0, *b_offset1; | |||
a_offset = a; | |||
b_offset = b; | |||
svuint16_t c0, c1, c2, c3, c4, c5, c6, c7; | |||
svuint16_t t0, t1, t2, t3; | |||
svuint32_t m00, m01, m10, m11; | |||
svuint64_t st_offsets_0, st_offsets_1; | |||
svbool_t pg16_first_4 = svwhilelt_b16(0, 4); | |||
svbool_t pg16_first_8 = svwhilelt_b16(0, 8); | |||
svbool_t pg64_first_4 = svwhilelt_b64(0, 4); | |||
u_int32_t sizeof_u64 = 8; | |||
u_int64_t _st_offsets_0[4] = { | |||
0 * sizeof_u64, | |||
1 * sizeof_u64, | |||
4 * sizeof_u64, | |||
5 * sizeof_u64, | |||
}; | |||
u_int64_t _st_offsets_1[4] = { | |||
2 * sizeof_u64, | |||
3 * sizeof_u64, | |||
6 * sizeof_u64, | |||
7 * sizeof_u64, | |||
}; | |||
st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0); | |||
st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1); | |||
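// Each transposed 4x8 panel is filled by two scatter stores: the offset
// tables {0,1,4,5} and {2,3,6,7} (in 64-bit units) drop the zipped lanes of
// m0x/m1x straight into their final positions in b.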
for (BLASLONG j = 0; j < n / 8; j++) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
a_offset += 8; | |||
b_offset0 = b_offset; | |||
b_offset1 = b_offset0 + 4 * pad_m; | |||
b_offset += 8 * pad_m; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
// transpose the 8x8 block and pack it into two 4x8 panels, each made up
// of two 2x4 sub-blocks
c0 = svld1_u16(pg16_first_8, a_offset0); | |||
c1 = svld1_u16(pg16_first_8, a_offset1); | |||
c2 = svld1_u16(pg16_first_8, a_offset2); | |||
c3 = svld1_u16(pg16_first_8, a_offset3); | |||
c4 = svld1_u16(pg16_first_8, a_offset4); | |||
c5 = svld1_u16(pg16_first_8, a_offset5); | |||
c6 = svld1_u16(pg16_first_8, a_offset6); | |||
c7 = svld1_u16(pg16_first_8, a_offset7); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_0, svreinterpret_u64_u32(m10)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_1, svreinterpret_u64_u32(m11)); | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
b_offset0 += 32; | |||
b_offset1 += 32; | |||
} | |||
if (rest) { | |||
c0 = svld1_u16(pg16_first_8, a_offset0); | |||
c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0)); | |||
c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0)); | |||
c3 = (rest >= 4 ? svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0)); | |||
c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0)); | |||
c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0)); | |||
c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0)); | |||
c7 = (svdup_u16(0)); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_0, svreinterpret_u64_u32(m10)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, | |||
st_offsets_1, svreinterpret_u64_u32(m11)); | |||
} | |||
} | |||
if (n & 4) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
a_offset += 4; | |||
b_offset0 = b_offset; | |||
b_offset += 4 * pad_m; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
// transpose the 8x4 block and pack it into one 4x8 panel made up of two
// 2x4 sub-blocks
c0 = svld1_u16(pg16_first_4, a_offset0); | |||
c1 = svld1_u16(pg16_first_4, a_offset1); | |||
c2 = svld1_u16(pg16_first_4, a_offset2); | |||
c3 = svld1_u16(pg16_first_4, a_offset3); | |||
c4 = svld1_u16(pg16_first_4, a_offset4); | |||
c5 = svld1_u16(pg16_first_4, a_offset5); | |||
c6 = svld1_u16(pg16_first_4, a_offset6); | |||
c7 = svld1_u16(pg16_first_4, a_offset7); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
b_offset0 += 32; | |||
} | |||
if (rest) { | |||
c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1 | |||
c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0)); | |||
c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0)); | |||
c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0)); | |||
c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0)); | |||
c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0)); | |||
c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0)); | |||
c7 = (svdup_u16(0)); | |||
t0 = svzip1_u16(c0, c1); | |||
t1 = svzip1_u16(c2, c3); | |||
t2 = svzip1_u16(c4, c5); | |||
t3 = svzip1_u16(c6, c7); | |||
m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); | |||
m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_0, svreinterpret_u64_u32(m00)); | |||
svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, | |||
st_offsets_1, svreinterpret_u64_u32(m01)); | |||
} | |||
} | |||
if (n & 2) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
a_offset += 2; | |||
b_offset0 = b_offset; | |||
b_offset1 = b_offset0 + 8; | |||
b_offset += 2 * pad_m; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
for (BLASLONG line = 0; line < 2; line++) { | |||
b_offset0[line * 4] = a_offset0[line]; | |||
b_offset0[line * 4 + 1] = a_offset1[line]; | |||
b_offset0[line * 4 + 2] = a_offset2[line]; | |||
b_offset0[line * 4 + 3] = a_offset3[line]; | |||
b_offset1[line * 4] = a_offset4[line]; | |||
b_offset1[line * 4 + 1] = a_offset5[line]; | |||
b_offset1[line * 4 + 2] = a_offset6[line]; | |||
b_offset1[line * 4 + 3] = a_offset7[line]; | |||
} | |||
b_offset0 += 16; | |||
b_offset1 += 16; | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
} | |||
if (rest) { | |||
for (BLASLONG line = 0; line < 2; line++) { | |||
b_offset0[line * 4] = a_offset0[line]; | |||
b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; | |||
b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; | |||
b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; | |||
b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line]; | |||
b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line]; | |||
b_offset1[line * 4 + 2] = rest <= 6 ? 0 : a_offset6[line]; | |||
b_offset1[line * 4 + 3] = 0; | |||
} | |||
} | |||
} | |||
if (n & 1) { | |||
a_offset0 = a_offset; | |||
a_offset1 = a_offset0 + lda; | |||
a_offset2 = a_offset1 + lda; | |||
a_offset3 = a_offset2 + lda; | |||
a_offset4 = a_offset3 + lda; | |||
a_offset5 = a_offset4 + lda; | |||
a_offset6 = a_offset5 + lda; | |||
a_offset7 = a_offset6 + lda; | |||
for (BLASLONG i = 0; i < m / 8; i++) { | |||
b_offset[0] = a_offset0[0]; | |||
b_offset[1] = a_offset1[0]; | |||
b_offset[2] = a_offset2[0]; | |||
b_offset[3] = a_offset3[0]; | |||
b_offset[4] = 0; | |||
b_offset[5] = 0; | |||
b_offset[6] = 0; | |||
b_offset[7] = 0; | |||
b_offset[8] = a_offset4[0]; | |||
b_offset[9] = a_offset5[0]; | |||
b_offset[10] = a_offset6[0]; | |||
b_offset[11] = a_offset7[0]; | |||
b_offset[12] = 0; | |||
b_offset[13] = 0; | |||
b_offset[14] = 0; | |||
b_offset[15] = 0; | |||
b_offset += 16; | |||
a_offset0 += 8 * lda; | |||
a_offset1 += 8 * lda; | |||
a_offset2 += 8 * lda; | |||
a_offset3 += 8 * lda; | |||
a_offset4 += 8 * lda; | |||
a_offset5 += 8 * lda; | |||
a_offset6 += 8 * lda; | |||
a_offset7 += 8 * lda; | |||
} | |||
if (rest) { | |||
b_offset[0] = *a_offset0; | |||
b_offset[1] = rest == 1 ? 0 : *a_offset1; | |||
b_offset[2] = rest <= 2 ? 0 : *a_offset2; | |||
b_offset[3] = rest <= 3 ? 0 : *a_offset3; | |||
b_offset[4] = 0; | |||
b_offset[5] = 0; | |||
b_offset[6] = 0; | |||
b_offset[7] = 0; | |||
b_offset[8] = rest <= 4 ? 0 : *a_offset4; | |||
b_offset[9] = rest <= 5 ? 0 : *a_offset5; | |||
b_offset[10] = rest <= 6 ? 0 : *a_offset6; | |||
b_offset[11] = 0; | |||
b_offset[12] = 0; | |||
b_offset[13] = 0; | |||
b_offset[14] = 0; | |||
b_offset[15] = 0; | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,515 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
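/* Scale the first n elements of x by beta, or clear the buffer outright when
 * beta == 0; uses 16-wide NEON blocks with 4-wide and scalar tails. */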
static void beta_op(float *x, BLASLONG n, FLOAT beta) { | |||
if (beta == 0) { | |||
memset(x, 0, n * sizeof(float)); | |||
return; | |||
} | |||
float32x4_t y0, y1, y2, y3; | |||
for (BLASLONG i = 0; i < n / 16; i++) { | |||
y0 = vld1q_f32(x); | |||
y1 = vld1q_f32(x + 4); | |||
y2 = vld1q_f32(x + 8); | |||
y3 = vld1q_f32(x + 12); | |||
y0 = vmulq_n_f32(y0, beta); | |||
y1 = vmulq_n_f32(y1, beta); | |||
y2 = vmulq_n_f32(y2, beta); | |||
y3 = vmulq_n_f32(y3, beta); | |||
vst1q_f32(x, y0); | |||
vst1q_f32(x + 4, y1); | |||
vst1q_f32(x + 8, y2); | |||
vst1q_f32(x + 12, y3); | |||
x += 16; | |||
} | |||
if (n & 15) { | |||
BLASLONG rest_n = n & 15; | |||
for (BLASLONG i = 0; i < (rest_n) / 4; i++) { | |||
y0 = vld1q_f32(x); | |||
y0 = vmulq_n_f32(y0, beta); | |||
vst1q_f32(x, y0); | |||
x += 4; | |||
} | |||
for (BLASLONG i = 0; i < (rest_n & 3); i ++) { | |||
x[i] *= beta; | |||
} | |||
} | |||
return; | |||
} | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, | |||
bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { | |||
BLASLONG i, j; | |||
bfloat16_t *a_ptr, *x_ptr; | |||
FLOAT *y_ptr; | |||
bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; | |||
bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; | |||
bfloat16x8_t x_vec; | |||
bfloat16x4_t x_vecx4; | |||
float32x4_t y1_vec, y2_vec; | |||
float32x4_t fp32_low, fp32_high; | |||
float x0, x1, x2, x3, x4, x5, x6, x7; | |||
bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6, | |||
*a_ptr7; | |||
a_ptr = (bfloat16_t *)a; | |||
x_ptr = (bfloat16_t *)x; | |||
BLASLONG rest_m = m & 3; | |||
bfloat16x4_t bf16_zero = vreinterpret_bf16_u16(vdup_n_u16(0)); | |||
bfloat16x8_t bf16_zero_q = vreinterpretq_bf16_u16(vdupq_n_u16(0)); | |||
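/* Fast path for unit strides: eight columns of A are processed per outer
 * iteration. Pairs of columns are interleaved with zip1/zip2 so that
 * vbfmlalb/vbfmlalt can multiply their even/odd bf16 lanes by broadcast lanes
 * of the (alpha-scaled) x vector, accumulating directly into fp32 y. */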
if (incx == 1 && incy == 1) { | |||
if (beta != 1) { | |||
beta_op(y, m, beta); | |||
} | |||
for (i = 0; i < n / 8; i++) { | |||
a_ptr0 = a_ptr; | |||
a_ptr1 = a_ptr0 + lda; | |||
a_ptr2 = a_ptr1 + lda; | |||
a_ptr3 = a_ptr2 + lda; | |||
a_ptr4 = a_ptr3 + lda; | |||
a_ptr5 = a_ptr4 + lda; | |||
a_ptr6 = a_ptr5 + lda; | |||
a_ptr7 = a_ptr6 + lda; | |||
a_ptr += 8 * lda; | |||
y_ptr = y; | |||
x_vec = vld1q_bf16(x_ptr); | |||
if (alpha != 1) { | |||
fp32_low = vreinterpretq_f32_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), | |||
vreinterpretq_u16_bf16(x_vec))); | |||
fp32_high = vreinterpretq_f32_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(bf16_zero_q), | |||
vreinterpretq_u16_bf16(x_vec))); | |||
fp32_low = vmulq_n_f32(fp32_low, alpha); | |||
fp32_high = vmulq_n_f32(fp32_high, alpha); | |||
x_vec = | |||
vcombine_bf16(vcvt_bf16_f32(fp32_low), vcvt_bf16_f32(fp32_high)); | |||
} | |||
for (j = 0; j < m / 8; j++) { | |||
a0 = vld1q_bf16(a_ptr0); | |||
a1 = vld1q_bf16(a_ptr1); | |||
a2 = vld1q_bf16(a_ptr2); | |||
a3 = vld1q_bf16(a_ptr3); | |||
a4 = vld1q_bf16(a_ptr4); | |||
a5 = vld1q_bf16(a_ptr5); | |||
a6 = vld1q_bf16(a_ptr6); | |||
a7 = vld1q_bf16(a_ptr7); | |||
y1_vec = vld1q_f32(y_ptr); | |||
y2_vec = vld1q_f32(y_ptr + 4); | |||
t0 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t1 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
t2 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||
t3 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||
t4 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t5 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
t6 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||
t7 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); | |||
y2_vec = vbfmlalbq_laneq_f32(y2_vec, t4, x_vec, 0); | |||
y2_vec = vbfmlaltq_laneq_f32(y2_vec, t4, x_vec, 1); | |||
y2_vec = vbfmlalbq_laneq_f32(y2_vec, t5, x_vec, 2); | |||
y2_vec = vbfmlaltq_laneq_f32(y2_vec, t5, x_vec, 3); | |||
y2_vec = vbfmlalbq_laneq_f32(y2_vec, t6, x_vec, 4); | |||
y2_vec = vbfmlaltq_laneq_f32(y2_vec, t6, x_vec, 5); | |||
y2_vec = vbfmlalbq_laneq_f32(y2_vec, t7, x_vec, 6); | |||
y2_vec = vbfmlaltq_laneq_f32(y2_vec, t7, x_vec, 7); | |||
vst1q_f32(y_ptr, y1_vec); | |||
vst1q_f32(y_ptr + 4, y2_vec); | |||
a_ptr0 += 8; | |||
a_ptr1 += 8; | |||
a_ptr2 += 8; | |||
a_ptr3 += 8; | |||
a_ptr4 += 8; | |||
a_ptr5 += 8; | |||
a_ptr6 += 8; | |||
a_ptr7 += 8; | |||
y_ptr += 8; | |||
} | |||
if (m & 4) { | |||
bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||
bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||
bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); | |||
bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); | |||
bfloat16x4_t a4x4 = vld1_bf16(a_ptr4); | |||
bfloat16x4_t a5x4 = vld1_bf16(a_ptr5); | |||
bfloat16x4_t a6x4 = vld1_bf16(a_ptr6); | |||
bfloat16x4_t a7x4 = vld1_bf16(a_ptr7); | |||
y1_vec = vld1q_f32(y_ptr); | |||
a0 = vcombine_bf16(a0x4, bf16_zero); | |||
a1 = vcombine_bf16(a1x4, bf16_zero); | |||
a2 = vcombine_bf16(a2x4, bf16_zero); | |||
a3 = vcombine_bf16(a3x4, bf16_zero); | |||
a4 = vcombine_bf16(a4x4, bf16_zero); | |||
a5 = vcombine_bf16(a5x4, bf16_zero); | |||
a6 = vcombine_bf16(a6x4, bf16_zero); | |||
a7 = vcombine_bf16(a7x4, bf16_zero); | |||
t0 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t1 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
t2 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); | |||
t3 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); | |||
y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); | |||
y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); | |||
vst1q_f32(y_ptr, y1_vec); | |||
a_ptr0 += 4; | |||
a_ptr1 += 4; | |||
a_ptr2 += 4; | |||
a_ptr3 += 4; | |||
a_ptr4 += 4; | |||
a_ptr5 += 4; | |||
a_ptr6 += 4; | |||
a_ptr7 += 4; | |||
y_ptr += 4; | |||
} | |||
if (rest_m) { | |||
x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); | |||
x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); | |||
x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); | |||
x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); | |||
x4 = alpha * vcvtah_f32_bf16(x_ptr[4]); | |||
x5 = alpha * vcvtah_f32_bf16(x_ptr[5]); | |||
x6 = alpha * vcvtah_f32_bf16(x_ptr[6]); | |||
x7 = alpha * vcvtah_f32_bf16(x_ptr[7]); | |||
for (BLASLONG j = 0; j < rest_m; j++) { | |||
y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||
y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); | |||
y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); | |||
y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); | |||
y_ptr[j] += x4 * vcvtah_f32_bf16(a_ptr4[j]); | |||
y_ptr[j] += x5 * vcvtah_f32_bf16(a_ptr5[j]); | |||
y_ptr[j] += x6 * vcvtah_f32_bf16(a_ptr6[j]); | |||
y_ptr[j] += x7 * vcvtah_f32_bf16(a_ptr7[j]); | |||
} | |||
} | |||
x_ptr += 8; | |||
} | |||
if (n & 4) { | |||
a_ptr0 = a_ptr; | |||
a_ptr1 = a_ptr0 + lda; | |||
a_ptr2 = a_ptr1 + lda; | |||
a_ptr3 = a_ptr2 + lda; | |||
a_ptr += 4 * lda; | |||
x_vecx4 = vld1_bf16(x_ptr); | |||
if (alpha != 1) { | |||
fp32_low = vcvt_f32_bf16(x_vecx4); | |||
fp32_low = vmulq_n_f32(fp32_low, alpha); | |||
x_vecx4 = vcvt_bf16_f32(fp32_low); | |||
} | |||
y_ptr = y; | |||
for (j = 0; j < m / 8; j++) { | |||
a0 = vld1q_bf16(a_ptr0); | |||
a1 = vld1q_bf16(a_ptr1); | |||
a2 = vld1q_bf16(a_ptr2); | |||
a3 = vld1q_bf16(a_ptr3); | |||
y1_vec = vld1q_f32(y_ptr); | |||
y2_vec = vld1q_f32(y_ptr + 4); | |||
t0 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t1 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
t4 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t5 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); | |||
y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); | |||
y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); | |||
y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); | |||
y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); | |||
y2_vec = vbfmlalbq_lane_f32(y2_vec, t5, x_vecx4, 2); | |||
y2_vec = vbfmlaltq_lane_f32(y2_vec, t5, x_vecx4, 3); | |||
vst1q_f32(y_ptr, y1_vec); | |||
vst1q_f32(y_ptr + 4, y2_vec); | |||
a_ptr0 += 8; | |||
a_ptr1 += 8; | |||
a_ptr2 += 8; | |||
a_ptr3 += 8; | |||
y_ptr += 8; | |||
} | |||
if (m & 4) { | |||
bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||
bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||
bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); | |||
bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); | |||
y1_vec = vld1q_f32(y_ptr); | |||
a0 = vcombine_bf16(a0x4, a2x4); | |||
a1 = vcombine_bf16(a1x4, a3x4); | |||
t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t1 = vreinterpretq_bf16_u16(vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); | |||
y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); | |||
vst1q_f32(y_ptr, y1_vec); | |||
a_ptr0 += 4; | |||
a_ptr1 += 4; | |||
a_ptr2 += 4; | |||
a_ptr3 += 4; | |||
y_ptr += 4; | |||
} | |||
if (rest_m) { | |||
fp32_low = vcvt_f32_bf16(x_vecx4); | |||
x0 = vgetq_lane_f32(fp32_low, 0); | |||
x1 = vgetq_lane_f32(fp32_low, 1); | |||
x2 = vgetq_lane_f32(fp32_low, 2); | |||
x3 = vgetq_lane_f32(fp32_low, 3); | |||
for (BLASLONG j = 0; j < rest_m; j++) { | |||
y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||
y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); | |||
y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); | |||
y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); | |||
} | |||
} | |||
x_ptr += 4; | |||
} | |||
if (n & 2) { | |||
a_ptr0 = a_ptr; | |||
a_ptr1 = a_ptr0 + lda; | |||
a_ptr += 2 * lda; | |||
x_vecx4 = vreinterpret_bf16_u16(vzip1_u16( | |||
vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[0])), | |||
vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[1])) | |||
)); | |||
if (alpha != 1) { | |||
fp32_low = vcvt_f32_bf16(x_vecx4); | |||
fp32_low = vmulq_n_f32(fp32_low, alpha); | |||
x_vecx4 = vcvt_bf16_f32(fp32_low); | |||
} | |||
y_ptr = y; | |||
for (j = 0; j < m / 8; j++) { | |||
a0 = vld1q_bf16(a_ptr0); | |||
a1 = vld1q_bf16(a_ptr1); | |||
y1_vec = vld1q_f32(y_ptr); | |||
y2_vec = vld1q_f32(y_ptr + 4); | |||
t0 = vreinterpretq_bf16_u16( | |||
vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
t1 = vreinterpretq_bf16_u16( | |||
vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
y2_vec = vbfmlalbq_lane_f32(y2_vec, t1, x_vecx4, 0); | |||
y2_vec = vbfmlaltq_lane_f32(y2_vec, t1, x_vecx4, 1); | |||
vst1q_f32(y_ptr, y1_vec); | |||
vst1q_f32(y_ptr + 4, y2_vec); | |||
a_ptr0 += 8; | |||
a_ptr1 += 8; | |||
y_ptr += 8; | |||
} | |||
if (m & 4) { | |||
bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); | |||
bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); | |||
y1_vec = vld1q_f32(y_ptr); | |||
a0 = vcombine_bf16(a0x4, bf16_zero); | |||
a1 = vcombine_bf16(a1x4, bf16_zero); | |||
t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); | |||
y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); | |||
y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); | |||
vst1q_f32(y_ptr, y1_vec); | |||
a_ptr0 += 4; | |||
a_ptr1 += 4; | |||
y_ptr += 4; | |||
} | |||
if (m & 2) { | |||
fp32_low = vcvt_f32_bf16(x_vecx4); | |||
x0 = vgetq_lane_f32(fp32_low, 0); | |||
x1 = vgetq_lane_f32(fp32_low, 1); | |||
y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); | |||
y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); | |||
y_ptr[1] += x0 * vcvtah_f32_bf16(a_ptr0[1]); | |||
y_ptr[1] += x1 * vcvtah_f32_bf16(a_ptr1[1]); | |||
a_ptr0 += 2; | |||
a_ptr1 += 2; | |||
y_ptr += 2; | |||
} | |||
if (m & 1) { | |||
fp32_low = vcvt_f32_bf16(x_vecx4); | |||
x0 = vgetq_lane_f32(fp32_low, 0); | |||
x1 = vgetq_lane_f32(fp32_low, 1); | |||
y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); | |||
y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); | |||
} | |||
x_ptr += 2; | |||
} | |||
if (n & 1) { | |||
x0 = vcvtah_f32_bf16(x_ptr[0]) * alpha; | |||
y_ptr = y; | |||
a_ptr0 = a_ptr; | |||
for (j = 0; j < m; j++) { | |||
y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); | |||
} | |||
} | |||
return (0); | |||
} | |||
BLASLONG iy = 0; | |||
for (i = 0; i < m; i++) { | |||
y[iy] *= beta; | |||
iy += incy; | |||
} | |||
for (j = 0; j < n; j++) { | |||
x0 = alpha * vcvtah_f32_bf16(*x_ptr); | |||
iy = 0; | |||
for (i = 0; i < m; i++) { | |||
y[iy] += x0 * vcvtah_f32_bf16(a_ptr[i]); | |||
iy += incy; | |||
} | |||
a_ptr += lda; | |||
x_ptr += incx; | |||
} | |||
return (0); | |||
} |
@@ -0,0 +1,202 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_neon.h> | |||
#include "common.h" | |||
int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) | |||
{ | |||
if (m < 1 || n < 1) return(0); | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
bfloat16_t *a_ptr; | |||
bfloat16_t *x_ptr; | |||
float *y_ptr; | |||
float temp; | |||
iy = 0; | |||
a_ptr = (bfloat16_t*)(a); | |||
x_ptr = (bfloat16_t*)(x); | |||
if (incx == 1) { | |||
BLASLONG width = n / 4; | |||
bfloat16_t *a0_ptr = a_ptr + lda * width * 0; | |||
bfloat16_t *a1_ptr = a_ptr + lda * width * 1; | |||
bfloat16_t *a2_ptr = a_ptr + lda * width * 2; | |||
bfloat16_t *a3_ptr = a_ptr + lda * width * 3; | |||
float *y0_ptr = y + incy * width * 0; | |||
float *y1_ptr = y + incy * width * 1; | |||
float *y2_ptr = y + incy * width * 2; | |||
float *y3_ptr = y + incy * width * 3; | |||
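// The n columns are split into four equal stripes that are processed in
// lockstep; each iteration forms one dot product per stripe with vbfdot and
// reduces the float32x4 accumulator horizontally once per column.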
for (j = 0; j < width; j++) { | |||
float32x4_t temp0_vec = vdupq_n_f32(0.0f); | |||
float32x4_t temp1_vec = vdupq_n_f32(0.0f); | |||
float32x4_t temp2_vec = vdupq_n_f32(0.0f); | |||
float32x4_t temp3_vec = vdupq_n_f32(0.0f); | |||
i = 0; | |||
while (i + 7 < m) { | |||
bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); | |||
bfloat16x8_t a0_vec = vld1q_bf16(a0_ptr + i); | |||
bfloat16x8_t a1_vec = vld1q_bf16(a1_ptr + i); | |||
bfloat16x8_t a2_vec = vld1q_bf16(a2_ptr + i); | |||
bfloat16x8_t a3_vec = vld1q_bf16(a3_ptr + i); | |||
temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); | |||
temp1_vec = vbfdotq_f32(temp1_vec, a1_vec, x_vec); | |||
temp2_vec = vbfdotq_f32(temp2_vec, a2_vec, x_vec); | |||
temp3_vec = vbfdotq_f32(temp3_vec, a3_vec, x_vec); | |||
i += 8; | |||
} | |||
if (i + 3 < m) { | |||
float32x2_t t0 = vdup_n_f32(0.0f); | |||
float32x2_t t1 = vdup_n_f32(0.0f); | |||
float32x2_t t2 = vdup_n_f32(0.0f); | |||
float32x2_t t3 = vdup_n_f32(0.0f); | |||
bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); | |||
bfloat16x4_t a0_vec = vld1_bf16(a0_ptr + i); | |||
bfloat16x4_t a1_vec = vld1_bf16(a1_ptr + i); | |||
bfloat16x4_t a2_vec = vld1_bf16(a2_ptr + i); | |||
bfloat16x4_t a3_vec = vld1_bf16(a3_ptr + i); | |||
t0 = vbfdot_f32(t0, a0_vec, x_vec); | |||
t1 = vbfdot_f32(t1, a1_vec, x_vec); | |||
t2 = vbfdot_f32(t2, a2_vec, x_vec); | |||
t3 = vbfdot_f32(t3, a3_vec, x_vec); | |||
float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); | |||
float32x2_t temp1_vec_low = vget_low_f32(temp1_vec); | |||
float32x2_t temp2_vec_low = vget_low_f32(temp2_vec); | |||
float32x2_t temp3_vec_low = vget_low_f32(temp3_vec); | |||
temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); | |||
temp1_vec = vcombine_f32(vadd_f32(t1, temp1_vec_low), vget_high_f32(temp1_vec)); | |||
temp2_vec = vcombine_f32(vadd_f32(t2, temp2_vec_low), vget_high_f32(temp2_vec)); | |||
temp3_vec = vcombine_f32(vadd_f32(t3, temp3_vec_low), vget_high_f32(temp3_vec)); | |||
i += 4; | |||
} | |||
if (beta == 0.0f) { | |||
y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec); | |||
y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec); | |||
y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec); | |||
y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec); | |||
} | |||
else { | |||
y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y0_ptr[iy]; | |||
y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec) + beta * y1_ptr[iy]; | |||
y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec) + beta * y2_ptr[iy]; | |||
y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec) + beta * y3_ptr[iy]; | |||
} | |||
for (; i < m; ++i) { | |||
y0_ptr[iy] += alpha * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||
y1_ptr[iy] += alpha * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||
y2_ptr[iy] += alpha * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||
y3_ptr[iy] += alpha * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||
} | |||
iy += incy; | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
a3_ptr += lda; | |||
} | |||
a_ptr = a3_ptr; | |||
y_ptr = y3_ptr; | |||
for (j = width * 4; j < n; j++) { | |||
float32x4_t temp0_vec = vdupq_n_f32(0.0f); | |||
i = 0; | |||
while (i + 7 < m) { | |||
bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); | |||
bfloat16x8_t a0_vec = vld1q_bf16(a_ptr + i); | |||
temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); | |||
i += 8; | |||
} | |||
if (i + 3 < m) { | |||
float32x2_t t0 = vdup_n_f32(0.0f); | |||
bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); | |||
bfloat16x4_t a0_vec = vld1_bf16(a_ptr + i); | |||
t0 = vbfdot_f32(t0, a0_vec, x_vec); | |||
float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); | |||
temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); | |||
i += 4; | |||
} | |||
if (beta == 0.0f) { | |||
y_ptr[iy] = alpha * vaddvq_f32(temp0_vec); | |||
} | |||
else { | |||
y_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y_ptr[iy]; | |||
} | |||
for (; i < m; ++i) { | |||
y_ptr[iy] += alpha * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); | |||
} | |||
iy += incy; | |||
a_ptr += lda; | |||
} | |||
return(0); | |||
} | |||
for (j = 0; j < n; j++) { | |||
temp = 0.0; | |||
ix = 0; | |||
for (i = 0; i < m; i++) { | |||
temp += vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[ix]); | |||
ix += incx; | |||
} | |||
if (beta == 0.0f) { | |||
y[iy] = alpha * temp; | |||
} | |||
else { | |||
y[iy] = alpha * temp + beta * y[iy]; | |||
} | |||
iy += incy; | |||
a_ptr += lda; | |||
} | |||
return (0); | |||
} |
@@ -0,0 +1,80 @@ | |||
/* | |||
Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. | |||
SPDX-License-Identifier: BSD-3-Clause-Clear | |||
*/ | |||
#include "common.h" | |||
#include <stdlib.h> | |||
#include <inttypes.h> | |||
#include <math.h> | |||
#if defined(HAVE_SME) | |||
/* Function prototypes */ | |||
extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ | |||
const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); | |||
extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\ | |||
const float * matLeft,\ | |||
const float * restrict matRight,\ | |||
const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL"); | |||
/* Function Definitions */ | |||
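/* Number of 32-bit elements in a streaming SVE vector: RDSVL #1 returns the
 * streaming vector length in bytes, and the shift right by 2 divides by
 * sizeof(float). */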
uint64_t sve_cntw() { | |||
uint64_t cnt; | |||
asm volatile( | |||
"rdsvl %[res], #1\n" | |||
"lsr %[res], %[res], #2\n" | |||
: [res] "=r" (cnt) :: | |||
); | |||
return cnt; | |||
} | |||
/*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ | |||
float * __restrict A, BLASLONG strideA, float * __restrict B,\ | |||
BLASLONG strideB , float * __restrict R, BLASLONG strideR) | |||
*/ | |||
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||
float * __restrict R, BLASLONG strideR){ | |||
uint64_t m_mod, vl_elms; | |||
vl_elms = sve_cntw(); | |||
m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | |||
float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | |||
/* Prevent compiler optimization by reading from memory instead | |||
* of reading directly from vector (z) registers. | |||
* */ | |||
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||
/* Pre-process the left matrix to make it suitable for | |||
matrix sum of outer-product calculation | |||
*/ | |||
sgemm_direct_sme1_preprocess(M, K, A, A_mod); | |||
/* Calculate C = A*B */ | |||
sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | |||
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||
free(A_mod); | |||
} | |||
#else | |||
void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||
BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ | |||
float * __restrict R, BLASLONG strideR){} | |||
#endif |
@@ -0,0 +1,228 @@ | |||
/* | |||
Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. | |||
SPDX-License-Identifier: BSD-3-Clause-Clear | |||
*/ | |||
/*-------------------------------------------------------------------------- | |||
* SME1 based Matrix multiplication code for FP32 input matrices to FP32 | |||
* output matrix | |||
* C = A*B | |||
* A: Left input matrix of dimension M x K | |||
* B: Right input matrix of dimension K x N | |||
* C: Result matrix of dimension M x N | |||
* | |||
* Usage of function: | |||
* sgemm_direct_sme1_2VLx2VL( uint64_t M , uint64_t K, uint64_t N,\ | |||
const float * restrict A_base,\ | |||
const float * restrict B_base,\ | |||
const float * restrict C_base); | |||
----------------------------------------------------------------------------*/ | |||
#define M x0 //M dimension | |||
#define K x1 //K dimension | |||
#define N x2 //N dimension | |||
#define A_base x3 //Pointer to left matrix(A) | |||
#define B_base x4 //Pointer to right matrix(B) | |||
#define C_base x5 //Pointer to result matrix(C) | |||
#define Aptr x6 //Pointer to traverse A | |||
#define Aptr_end x7 //Pointer to end of row of A | |||
#define Cptr x8 //Pointer to traverse C | |||
#define Cptr0 x9 //2nd Pointer to traverse C | |||
#define Cptr1 x10 //3rd Pointer to traverse C | |||
#define Bptr x11 //Pointer to traverse B | |||
#define Bptr0 x12 //2nd Pointer to traverse B | |||
#define N_exit x14 //Exit condition for N loop | |||
#define K_exit x15 //Exit condition for K loop | |||
#define M_cntr x16 //M loop counter | |||
#define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements | |||
#define C2 x18 //Constant2: N + SVLs | |||
#define C3 x19 //Constant3: K*SVLs + SVLs | |||
#define C4 x20 //Constant4: SVLs-2 | |||
#define C5 x21 //Constant5: K*SVLs | |||
#define C6 x22 //Constant6: N*SVLs | |||
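// ZA0-ZA3 together hold a 2*SVLs x 2*SVLs tile of C; every FMOPA below adds
// one fp32 outer-product (rank-1) update from a vector of A and a vector of B
// while walking the K dimension.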
.text | |||
.global sgemm_direct_sme1_2VLx2VL | |||
sgemm_direct_sme1_2VLx2VL: | |||
stp x19, x20, [sp, #-48]! | |||
stp x21, x22, [sp, #16] | |||
stp x23, x24, [sp, #32] | |||
smstart | |||
cntw C4 //SVLs | |||
mul C5, C4, K //K*SVLs | |||
mul C6, C4, N //N*SVLs | |||
add C1, C6, N //N*SVLs + N | |||
add N_exit, B_base, N, lsl #2 //N_Loop exit condition
mov M_cntr, #0 | |||
add C2, N, C4 //N + SVLs | |||
add C3, C5, C4 //K*SVLs + SVLs | |||
whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension) | |||
sub w20, w20, #2 //SVLs-2 | |||
.M_Loop: | |||
incw M_cntr | |||
whilelt p3.s, M_cntr, M //Tile 2,3 predicate (M dimension) | |||
mov Bptr, B_base //B_base | |||
mov Cptr, C_base //C_base | |||
whilelt p0.b, Bptr, N_exit //Tile 0,2 predicate (N dimension)
.N_Loop: | |||
mov Aptr, A_base //Aptr = A_base | |||
mov Bptr0, Bptr //Bptr0 = Bptr
mov Cptr0, Cptr //Cptr0 = Cptr
addvl Cptr1, Cptr, #1 //Cptr1 = Cptr + SVLb
addvl Bptr, Bptr, #1 | |||
whilelt p1.b, Bptr, N_exit //Tile 1,3 predicate (N dimension) | |||
add Aptr_end, A_base, C5, lsl #2 //A_base + K*SVLs | |||
addvl K_exit, Aptr_end, #-1 //Exit condition for K loop | |||
//Load 1st vector from Aptr | |||
ld1w {z1.s}, p2/z, [Aptr] | |||
zero {za} | |||
// Load 1st vector from Bptr | |||
ld1w {z2.s}, p0/z, [Bptr0] | |||
// ZA0 += 1st Aptr vector OP 1st Bptr vector | |||
fmopa za0.s, p2/m, p0/m, z1.s, z2.s | |||
// Load 2nd vector from Aptr | |||
ld1w {z5.s}, p3/z, [Aptr, C5, lsl #2] | |||
// Aptr += SVLb | |||
addvl Aptr, Aptr, #1 | |||
.K_Loop: | |||
// ZA2 += 2nd Aptr vector OP 1st Bptr vector | |||
fmopa za2.s, p3/m, p0/m, z5.s, z2.s | |||
// Load 2nd vector from Bptr | |||
ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] | |||
// ZA1 += 1st Aptr vector OP 2nd Bptr vector | |||
fmopa za1.s, p2/m, p1/m, z1.s, z3.s | |||
// Load next 1st vector from Aptr | |||
ld1w {z0.s}, p2/z, [Aptr] | |||
// ZA3 += 2nd Aptr vector OP 2nd Bptr vector | |||
fmopa za3.s, p3/m, p1/m, z5.s, z3.s | |||
cmp K, #2 | |||
b.le process_K_less_than_equal_2 | |||
// Load next 1st vector from Bptr | |||
ld1w {z6.s}, p0/z, [Bptr0, N, lsl #2] | |||
// ZA0 += 1st Aptr vector OP 1st Bptr vector | |||
fmopa za0.s, p2/m, p0/m, z0.s, z6.s | |||
// Load next 2nd vector from Aptr | |||
ld1w {z4.s}, p3/z, [Aptr, C5, lsl #2] | |||
// ZA2 += 2nd Aptr vector OP 1st Bptr vector | |||
fmopa za2.s, p3/m, p0/m, z4.s, z6.s | |||
// Load next 2nd vector from Bptr | |||
ld1w {z7.s}, p1/z, [Bptr0, C2, lsl #2] | |||
// Bptr += 2*ldb FP32 elms [Bytes] | |||
add Bptr0, Bptr0, N, lsl #3 | |||
// ZA1 += 1st Aptr vector OP 2nd Bptr vector | |||
fmopa za1.s, p2/m, p1/m, z0.s, z7.s | |||
// Load next 2nd vector from Aptr | |||
ld1w {z1.s}, p2/z, [Aptr, #1, MUL VL] | |||
// ZA3 += 2nd Aptr vector OP 2nd Bptr vector | |||
fmopa za3.s, p3/m, p1/m, z4.s, z7.s | |||
// Load next 1st vector from Bptr | |||
ld1w {z2.s}, p0/z, [Bptr0] | |||
// ZA0 += 1st Aptr vector OP 1st Bptr vector | |||
fmopa za0.s, p2/m, p0/m, z1.s, z2.s | |||
// Load next 2nd vector from Aptr | |||
ld1w {z5.s}, p3/z, [Aptr, C3, lsl #2] | |||
// Aptr += 2*SVLb [Bytes] | |||
addvl Aptr, Aptr, #2 | |||
cmp Aptr, K_exit | |||
b.mi .K_Loop | |||
// ZA2 += 2nd Aptr vector OP 1st Bptr vector | |||
fmopa za2.s, p3/m, p0/m, z5.s, z2.s | |||
// Load next 2nd vector from Bptr | |||
ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] | |||
// ZA1 += 1st Aptr vector OP 2nd Bptr vector | |||
fmopa za1.s, p2/m, p1/m, z1.s, z3.s | |||
// ZA3 += 2nd Aptr vector OP 2nd Bptr vector | |||
fmopa za3.s, p3/m, p1/m, z5.s, z3.s | |||
process_K_less_than_equal_2: | |||
// Bptr += 2*ldb FP32 elements | |||
add Bptr0, Bptr0, N, lsl #2 | |||
cmp Aptr, Aptr_end | |||
b.pl .Ktail_end | |||
.Ktail_start: | |||
ld1w {z1.s}, p2/z, [Aptr] | |||
ld1w {z2.s}, p0/z, [Bptr0] | |||
ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] | |||
fmopa za0.s, p2/m, p0/m, z1.s, z2.s | |||
ld1w {z5.s}, p3/z, [Aptr, C5, lsl #2] | |||
fmopa za2.s, p3/m, p0/m, z5.s, z2.s | |||
fmopa za1.s, p2/m, p1/m, z1.s, z3.s | |||
fmopa za3.s, p3/m, p1/m, z5.s, z3.s | |||
.Ktail_end: | |||
mov w13, #0 | |||
psel p4, p0, p2.s[w13, 0] | |||
psel p5, p1, p2.s[w13, 0] | |||
psel p6, p0, p3.s[w13, 0] | |||
psel p7, p1, p3.s[w13, 0] | |||
// Store to Cptr0 | |||
st1w {za0h.s[w13, #0]}, p4, [Cptr0] | |||
// Store to Cptr1 | |||
st1w {za1h.s[w13, #0]}, p5, [Cptr1] | |||
// Store to Cptr0 + N*SVLs | |||
st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] | |||
// Store to Cptr1 + N*SVLs | |||
st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] | |||
.Loop_store_ZA: | |||
psel p4, p0, p2.s[w13, 1] | |||
psel p5, p1, p2.s[w13, 1] | |||
psel p6, p0, p3.s[w13, 1] | |||
psel p7, p1, p3.s[w13, 1] | |||
// Store to Cptr0 + N | |||
st1w {za0h.s[w13, #1]}, p4, [Cptr0, N, lsl #2] | |||
// Store to Cptr1 + N | |||
st1w {za1h.s[w13, #1]}, p5, [Cptr1, N, lsl #2] | |||
// Store to Cptr0 + N*(SVLs+1) | |||
st1w {za2h.s[w13, #1]}, p6, [Cptr0, C1, lsl #2] | |||
// Store to Cptr1 + N*(SVLs+1) | |||
st1w {za3h.s[w13, #1]}, p7, [Cptr1, C1, lsl #2] | |||
add Cptr0, Cptr0, N, lsl #3 //Cptr0 += 2*N FP32 elements | |||
add Cptr1, Cptr1, N, lsl #3 //Cptr1 += 2*N FP32 elements | |||
add w13, w13, #2 | |||
psel p4, p0, p2.s[w13, 0] | |||
psel p5, p1, p2.s[w13, 0] | |||
psel p6, p0, p3.s[w13, 0] | |||
psel p7, p1, p3.s[w13, 0] | |||
st1w {za0h.s[w13, #0]}, p4, [Cptr0] | |||
st1w {za1h.s[w13, #0]}, p5, [Cptr1] | |||
st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] | |||
st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] | |||
cmp w13, w20 | |||
b.mi .Loop_store_ZA | |||
psel p4, p0, p2.s[w13, 1] | |||
psel p5, p1, p2.s[w13, 1] | |||
psel p6, p0, p3.s[w13, 1] | |||
psel p7, p1, p3.s[w13, 1] | |||
st1w {za0h.s[w13, #1]}, p4, [Cptr0, N, lsl #2] | |||
st1w {za1h.s[w13, #1]}, p5, [Cptr1, N, lsl #2] | |||
st1w {za2h.s[w13, #1]}, p6, [Cptr0, C1, lsl #2] | |||
st1w {za3h.s[w13, #1]}, p7, [Cptr1, C1, lsl #2] | |||
addvl Cptr, Cptr, #2 | |||
addvl Bptr, Bptr, #1 | |||
whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension) | |||
b.first .N_Loop | |||
add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements | |||
add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements | |||
incw M_cntr | |||
whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension) | |||
b.first .M_Loop | |||
smstop | |||
ldp x23, x24, [sp, #32] | |||
ldp x21, x22, [sp, #16] | |||
ldp x19, x20, [sp], #48 | |||
ret | |||
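The loop above keeps four ZA tiles (za0..za3) live, covering a 2*SVLs x 2*SVLs block of C; each FMOPA is a predicated rank-1 (outer-product) update of one tile from an SVLs-element column of the pre-processed A and an SVLs-element segment of a row of B. A scalar model of a single such update, with illustrative names (fmopa_ref, svls) and no predication:

#include <stddef.h>

/* za[i][j] += a_col[i] * b_row[j] -- the unpredicated core of one FMOPA. */
static void fmopa_ref(size_t svls, float za[svls][svls],
                      const float a_col[svls], const float b_row[svls])
{
    for (size_t i = 0; i < svls; i++)
        for (size_t j = 0; j < svls; j++)
            za[i][j] += a_col[i] * b_row[j];
}

za0/za1/za2/za3 then correspond to the four combinations of the two A vectors (row blocks m..m+SVLs-1 and m+SVLs..m+2*SVLs-1) with the two B vectors (column blocks n..n+SVLs-1 and n+SVLs..n+2*SVLs-1) loaded for each k.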
@@ -0,0 +1,133 @@ | |||
/* | |||
Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. | |||
SPDX-License-Identifier: BSD-3-Clause-Clear | |||
*/ | |||
/*---------------------------------------------------------------------------- | |||
 * This function re-arranges the elements of the input matrix to make it
 * suitable for matrix outer-product computation using SME for matrix
 * multiplication. It should be used to pre-process the left matrix (A) in
 * the matrix multiplication (C = A*B) performed by sgemm_direct_sme1_2VLx2VL().
 *
 * The pre-processing transposes each block of SVLs rows of the input matrix
 * and stores it contiguously; the last block is zero-padded to a full SVLs
 * rows if needed.
* | |||
* Usage of function: | |||
* sgemm_direct_sme1_preprocess(uint64_t nrow, uint64_t ncol, \ | |||
* const float * restrict mat, float * mat_mod); | |||
* | |||
----------------------------------------------------------------------------*/ | |||
#define nrow x0 //Number of rows of input matrix | |||
#define ncol x1 //Number of columns of input matrix
#define mat x2 //Input matrix base address | |||
#define mat_mod x3 //Output matrix (re-arranged matrix) base address | |||
#define mat_mod_ptr x4 //Pointer to output matrix | |||
#define mat_ptr0 x5 //Pointer to input matrix | |||
#define mat_ptr1 x6 //2nd pointer to input matrix | |||
#define outer_loop_cntr x7 //Outer loop counter | |||
#define inner_loop_exit x8 //Inner loop exit condition | |||
#define C1 x9 //Constant1: SVLs - No. of 32-bit elements | |||
#define C2 x10 //Constant2: 3*SVLs | |||
#define C3 x11 //Constant3: ncol*SVLs | |||
#define C4 x13 //Constant4: 2*SVLs | |||
#define C5 x14 //Constant5: 2*ncol | |||
#define C6 x15 //Constant6: 3*ncol | |||
.text | |||
.global sgemm_direct_sme1_preprocess | |||
sgemm_direct_sme1_preprocess: | |||
stp x19, x20, [sp, #-48]! | |||
stp x21, x22, [sp, #16] | |||
stp x23, x24, [sp, #32] | |||
smstart | |||
cntw C1 //SVLs | |||
mul C3, C1, ncol //SVLs*ncol | |||
lsl C5, ncol, #1 //2*ncol | |||
add C6, C5, ncol //3*ncol | |||
cnth C4 //2*SVLs | |||
add C2, C1, C1, lsl #1 //3*SVLs | |||
mov outer_loop_cntr, #0 | |||
//Tile predicate (M dimension) | |||
whilelt p0.s, outer_loop_cntr, nrow | |||
//Predicate for stores | |||
ptrue p9.s | |||
.M_Loop: | |||
mov mat_ptr0, mat //Load base address of mat | |||
mov mat_mod_ptr, mat_mod //mat_mod store base address
add inner_loop_exit, mat, ncol, lsl #2 //Exit condition for inner loop | |||
whilelt p8.b, mat_ptr0, inner_loop_exit //Tile predicate (K dimension) | |||
.Loop_process: | |||
mov mat_ptr1, mat_ptr0 | |||
//Load_to_tile loop counter | |||
mov w12, #0 | |||
.Load_to_tile: | |||
psel p2, p8, p0.s[w12, 0] | |||
psel p3, p8, p0.s[w12, 1] | |||
psel p4, p8, p0.s[w12, 2] | |||
psel p5, p8, p0.s[w12, 3] | |||
//Load 1st row from mat_ptr1 | |||
ld1w {za0h.s[w12, #0]}, p2/z, [mat_ptr1] | |||
//Load 2nd row from mat_ptr1 + ncol | |||
ld1w {za0h.s[w12, #1]}, p3/z, [mat_ptr1, ncol, lsl #2] | |||
//Load 3rd row from mat_ptr1 + 2*ncol | |||
ld1w {za0h.s[w12, #2]}, p4/z, [mat_ptr1, C5, lsl #2] | |||
//Load 4th row from mat_ptr1 + 3*ncol | |||
ld1w {za0h.s[w12, #3]}, p5/z, [mat_ptr1, C6, lsl #2] | |||
//mat_ptr1+=4*ncol FP32 elements | |||
add mat_ptr1, mat_ptr1, ncol, lsl #4 | |||
//Increment counter | |||
add w12, w12, #4 | |||
cmp w12, w9 | |||
b.mi .Load_to_tile | |||
// Store_from_tile loop counter | |||
mov w12, #0 | |||
.Store_from_tile: | |||
psel p2, p9, p8.s[w12, 0] | |||
psel p3, p9, p8.s[w12, 1] | |||
psel p4, p9, p8.s[w12, 2] | |||
psel p5, p9, p8.s[w12, 3] | |||
//Store 1st col to mat_mod | |||
st1w {za0v.s[w12, #0]}, p2, [mat_mod_ptr] | |||
//Store 2nd col to mat_mod + SVLs | |||
st1w {za0v.s[w12, #1]}, p3, [mat_mod_ptr, C1, lsl #2] | |||
//Store 3rd col to mat_mod + 2*SVLs | |||
st1w {za0v.s[w12, #2]}, p4, [mat_mod_ptr, C4, lsl #2] | |||
//Store 4th col to mat_mod + 3*SVLs | |||
st1w {za0v.s[w12, #3]}, p5, [mat_mod_ptr, C2, lsl #2] | |||
addvl mat_mod_ptr, mat_mod_ptr, #4 //mat_mod_ptr += 4*SVLb | |||
add w12, w12, #4 //Increment counter | |||
cmp w12, w9 | |||
b.mi .Store_from_tile | |||
addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb | |||
whilelt p8.b, mat_ptr0, inner_loop_exit | |||
b.first .Loop_process | |||
add mat_mod, mat_mod, C3, lsl #2 //mat_mod += SVLs*ncol FP32 elements
add mat, mat, C3, lsl #2 //mat += SVLs*ncol FP32 elements
incw outer_loop_cntr | |||
whilelt p0.s, outer_loop_cntr, nrow | |||
b.first .M_Loop | |||
smstop | |||
ldp x23, x24, [sp, #32] | |||
ldp x21, x22, [sp, #16] | |||
ldp x19, x20, [sp], #48 | |||
ret | |||
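In other words, each block of SVLs rows of the input is transposed and written out contiguously (column-contiguous within the block), with the final partial block zero-padded by the zeroing predicated loads. A scalar sketch of that layout, assuming SVLs 32-bit lanes per vector; the name preprocess_ref is illustrative:

#include <stddef.h>
#include <string.h>

/* mat: nrow x ncol, row-major.  mat_mod receives ceil(nrow/svls) blocks of
 * svls x ncol each, stored transposed (column-contiguous) and zero-padded. */
static void preprocess_ref(size_t svls, size_t nrow, size_t ncol,
                           const float *mat, float *mat_mod)
{
    size_t nblk = (nrow + svls - 1) / svls;
    memset(mat_mod, 0, nblk * svls * ncol * sizeof(float));
    for (size_t r = 0; r < nrow; r++)
        for (size_t c = 0; c < ncol; c++)
            mat_mod[((r / svls) * ncol + c) * svls + (r % svls)] = mat[r * ncol + c];
}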
@@ -88,28 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
ldr q0, [A01], #16 | |||
ins v8.s[0], v0.s[0] | |||
ins v9.s[0], v0.s[1] | |||
ins v10.s[0], v0.s[2] | |||
ins v11.s[0], v0.s[3] | |||
ldr q1, [A02], #16 | |||
ins v8.s[1], v1.s[0] | |||
ins v9.s[1], v1.s[1] | |||
ins v10.s[1], v1.s[2] | |||
ins v11.s[1], v1.s[3] | |||
ldr q2, [A03], #16 | |||
ins v8.s[2], v2.s[0] | |||
ins v9.s[2], v2.s[1] | |||
ins v10.s[2], v2.s[2] | |||
ins v11.s[2], v2.s[3] | |||
ldr q3, [A04], #16 | |||
ins v8.s[3], v3.s[0] | |||
ins v9.s[3], v3.s[1] | |||
ins v10.s[3], v3.s[2] | |||
ins v11.s[3], v3.s[3] | |||
zip1 v12.4s, v0.4s, v1.4s | |||
zip1 v13.4s, v2.4s, v3.4s | |||
zip2 v14.4s, v0.4s, v1.4s | |||
zip2 v15.4s, v2.4s, v3.4s | |||
zip1 v8.2d, v12.2d, v13.2d | |||
zip2 v9.2d, v12.2d, v13.2d | |||
zip1 v10.2d, v14.2d, v15.2d | |||
zip2 v11.2d, v14.2d, v15.2d | |||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||
add B00, B00, #64 | |||
@@ -138,16 +129,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
ldr q0, [A01], #16 | |||
ins v8.s[0], v0.s[0] | |||
ins v9.s[0], v0.s[1] | |||
ins v10.s[0], v0.s[2] | |||
ins v11.s[0], v0.s[3] | |||
ldr q1, [A02], #16 | |||
ins v8.s[1], v1.s[0] | |||
ins v9.s[1], v1.s[1] | |||
ins v10.s[1], v1.s[2] | |||
ins v11.s[1], v1.s[3] | |||
zip1 v12.4s, v0.4s, v1.4s | |||
zip2 v13.4s, v0.4s, v1.4s | |||
dup v8.2d, v12.d[0] | |||
dup v9.2d, v12.d[1] | |||
dup v10.2d, v13.d[0] | |||
dup v11.2d, v13.d[1]
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||
add B00, B00, #32 | |||
@@ -330,4 +320,3 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ret | |||
EPILOGUE | |||
@@ -86,47 +86,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro COPY4x8 | |||
ldr q0, [A01], #16 | |||
ldr q1, [A02], #16 | |||
ins v8.s[0], v0.s[0] | |||
ins v10.s[0], v0.s[1] | |||
ins v12.s[0], v0.s[2] | |||
ins v14.s[0], v0.s[3] | |||
ins v8.s[1], v1.s[0] | |||
ins v10.s[1], v1.s[1] | |||
ins v12.s[1], v1.s[2] | |||
ins v14.s[1], v1.s[3] | |||
ldr q2, [A03], #16 | |||
ldr q3, [A04], #16 | |||
ins v8.s[2], v2.s[0] | |||
ins v10.s[2], v2.s[1] | |||
ins v12.s[2], v2.s[2] | |||
ins v14.s[2], v2.s[3] | |||
ins v8.s[3], v3.s[0] | |||
ins v10.s[3], v3.s[1] | |||
ins v12.s[3], v3.s[2] | |||
ins v14.s[3], v3.s[3] | |||
zip1 v16.4s, v0.4s, v1.4s | |||
zip1 v17.4s, v2.4s, v3.4s | |||
zip2 v18.4s, v0.4s, v1.4s | |||
zip2 v19.4s, v2.4s, v3.4s | |||
zip1 v8.2d, v16.2d, v17.2d | |||
zip2 v10.2d, v16.2d, v17.2d | |||
zip1 v12.2d, v18.2d, v19.2d | |||
zip2 v14.2d, v18.2d, v19.2d | |||
ldr q4, [A05], #16 | |||
ldr q5, [A06], #16 | |||
ins v9.s[0], v4.s[0] | |||
ins v11.s[0], v4.s[1] | |||
ins v13.s[0], v4.s[2] | |||
ins v15.s[0], v4.s[3] | |||
ins v9.s[1], v5.s[0] | |||
ins v11.s[1], v5.s[1] | |||
ins v13.s[1], v5.s[2] | |||
ins v15.s[1], v5.s[3] | |||
ldr q6, [A07], #16 | |||
ldr q7, [A08], #16 | |||
ins v9.s[2], v6.s[0] | |||
ins v11.s[2], v6.s[1] | |||
ins v13.s[2], v6.s[2] | |||
ins v15.s[2], v6.s[3] | |||
ins v9.s[3], v7.s[0] | |||
ins v11.s[3], v7.s[1] | |||
ins v13.s[3], v7.s[2] | |||
ins v15.s[3], v7.s[3] | |||
zip1 v16.4s, v4.4s, v5.4s | |||
zip1 v17.4s, v6.4s, v7.4s | |||
zip2 v18.4s, v4.4s, v5.4s | |||
zip2 v19.4s, v6.4s, v7.4s | |||
zip1 v9.2d, v16.2d, v17.2d | |||
zip2 v11.2d, v16.2d, v17.2d | |||
zip1 v13.2d, v18.2d, v19.2d | |||
zip2 v15.2d, v18.2d, v19.2d | |||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 | |||
@@ -135,31 +121,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro COPY2x8 | |||
ldr d0, [A01], #8 | |||
ldr d1, [A02], #8 | |||
ins v8.s[0], v0.s[0] | |||
ins v10.s[0], v0.s[1] | |||
ins v8.s[1], v1.s[0] | |||
ins v10.s[1], v1.s[1] | |||
ldr d2, [A03], #8 | |||
ldr d3, [A04], #8 | |||
ins v8.s[2], v2.s[0] | |||
ins v10.s[2], v2.s[1] | |||
ins v8.s[3], v3.s[0] | |||
ins v10.s[3], v3.s[1] | |||
zip1 v12.4s, v0.4s, v1.4s | |||
zip1 v13.4s, v2.4s, v3.4s | |||
zip1 v8.2d, v12.2d, v13.2d | |||
zip2 v10.2d, v12.2d, v13.2d | |||
ldr d4, [A05], #8 | |||
ldr d5, [A06], #8 | |||
ins v9.s[0], v4.s[0] | |||
ins v11.s[0], v4.s[1] | |||
ins v9.s[1], v5.s[0] | |||
ins v11.s[1], v5.s[1] | |||
ldr d6, [A07], #8 | |||
ldr d7, [A08], #8 | |||
ins v9.s[2], v6.s[0] | |||
ins v11.s[2], v6.s[1] | |||
ins v9.s[3], v7.s[0] | |||
ins v11.s[3], v7.s[1] | |||
zip1 v12.4s, v4.4s, v5.4s | |||
zip1 v13.4s, v6.4s, v7.4s | |||
zip1 v9.2d, v12.2d, v13.2d | |||
zip2 v11.2d, v12.2d, v13.2d | |||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
.endm | |||
@@ -191,25 +171,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro COPY4x4 | |||
ldr q0, [A01], #16 | |||
ldr q1, [A02], #16 | |||
ins v8.s[0], v0.s[0] | |||
ins v9.s[0], v0.s[1] | |||
ins v10.s[0], v0.s[2] | |||
ins v11.s[0], v0.s[3] | |||
ins v8.s[1], v1.s[0] | |||
ins v9.s[1], v1.s[1] | |||
ins v10.s[1], v1.s[2] | |||
ins v11.s[1], v1.s[3] | |||
ldr q2, [A03], #16 | |||
ldr q3, [A04], #16 | |||
ins v8.s[2], v2.s[0] | |||
ins v9.s[2], v2.s[1] | |||
ins v10.s[2], v2.s[2] | |||
ins v11.s[2], v2.s[3] | |||
ins v8.s[3], v3.s[0] | |||
ins v9.s[3], v3.s[1] | |||
ins v10.s[3], v3.s[2] | |||
ins v11.s[3], v3.s[3] | |||
zip1 v12.4s, v0.4s, v1.4s | |||
zip1 v13.4s, v2.4s, v3.4s | |||
zip2 v14.4s, v0.4s, v1.4s | |||
zip2 v15.4s, v2.4s, v3.4s | |||
zip1 v8.2d, v12.2d, v13.2d | |||
zip2 v9.2d, v12.2d, v13.2d | |||
zip1 v10.2d, v14.2d, v15.2d | |||
zip2 v11.2d, v14.2d, v15.2d | |||
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
.endm | |||
@@ -217,17 +190,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro COPY2x4 | |||
ldr d0, [A01], #8 | |||
ldr d1, [A02], #8 | |||
ins v8.s[0], v0.s[0] | |||
ins v9.s[0], v0.s[1] | |||
ins v8.s[1], v1.s[0] | |||
ins v9.s[1], v1.s[1] | |||
ldr d2, [A03], #8 | |||
ldr d3, [A04], #8 | |||
ins v8.s[2], v2.s[0] | |||
ins v9.s[2], v2.s[1] | |||
ins v8.s[3], v3.s[0] | |||
ins v9.s[3], v3.s[1] | |||
zip1 v10.4s, v0.4s, v1.4s | |||
zip1 v11.4s, v2.4s, v3.4s | |||
zip1 v8.2d, v10.2d, v11.2d | |||
zip2 v9.2d, v10.2d, v11.2d | |||
st1 {v8.4s, v9.4s}, [B00], #32 | |||
.endm | |||
@@ -249,14 +219,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro COPY4x2 | |||
ldr q0, [A01], #16 | |||
ldr q1, [A02], #16 | |||
ins v8.s[0], v0.s[0] | |||
ins v9.s[0], v0.s[1] | |||
ins v10.s[0], v0.s[2] | |||
ins v11.s[0], v0.s[3] | |||
ins v8.s[1], v1.s[0] | |||
ins v9.s[1], v1.s[1] | |||
ins v10.s[1], v1.s[2] | |||
ins v11.s[1], v1.s[3] | |||
zip1 v12.4s, v0.4s, v1.4s | |||
zip2 v13.4s, v0.4s, v1.4s | |||
dup v8.2d, v12.d[0] | |||
dup v9.2d, v12.d[1] | |||
dup v10.2d, v13.d[0] | |||
dup v11.2d, v13.d[1]
st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 | |||
.endm | |||
@@ -264,10 +234,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro COPY2x2 | |||
ldr d0, [A01], #8 | |||
ldr d1, [A02], #8 | |||
ins v8.s[0], v0.s[0] | |||
ins v9.s[0], v0.s[1] | |||
ins v8.s[1], v1.s[0] | |||
ins v9.s[1], v1.s[1] | |||
zip1 v8.2s, v0.2s, v1.2s | |||
zip2 v9.2s, v0.2s, v1.2s | |||
st1 {v8.2s, v9.2s}, [B00], #16 | |||
.endm | |||
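The rewritten copy macros above replace chains of single-lane ins instructions with zip1/zip2 interleaves, i.e. the standard two-stage 4x4 transpose (the narrower COPY2x* and COPY*x2 variants use only one zip stage, or zip plus dup). A hedged NEON-intrinsics illustration of the same idea; the helper name transpose4x4 is not part of these kernels:

#include <arm_neon.h>

/* Transpose a 4x4 float block held in four row vectors. */
static inline void transpose4x4(float32x4_t r0, float32x4_t r1,
                                float32x4_t r2, float32x4_t r3,
                                float32x4_t out[4])
{
    /* Stage 1: interleave 32-bit lanes of each row pair. */
    float32x4_t t0 = vzip1q_f32(r0, r1);   /* r0[0] r1[0] r0[1] r1[1] */
    float32x4_t t1 = vzip1q_f32(r2, r3);   /* r2[0] r3[0] r2[1] r3[1] */
    float32x4_t t2 = vzip2q_f32(r0, r1);   /* r0[2] r1[2] r0[3] r1[3] */
    float32x4_t t3 = vzip2q_f32(r2, r3);   /* r2[2] r3[2] r2[3] r3[3] */
    /* Stage 2: interleave 64-bit halves to produce the four columns. */
    out[0] = vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(t0),
                                              vreinterpretq_f64_f32(t1)));
    out[1] = vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(t0),
                                              vreinterpretq_f64_f32(t1)));
    out[2] = vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(t2),
                                              vreinterpretq_f64_f32(t3)));
    out[3] = vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(t2),
                                              vreinterpretq_f64_f32(t3)));
}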
@@ -222,7 +222,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n8 = N & -8; | |||
const BLASLONG n4 = N & -4; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -223,7 +223,7 @@ CNAME(BLASLONG M, | |||
const BLASLONG n8 = N & -8; | |||
const BLASLONG n4 = N & -4; | |||
const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||
FLOAT* packed_a = | |||
(pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
@@ -0,0 +1,219 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_neon.h> | |||
#include "common.h" | |||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT temp; | |||
ix = 0; | |||
a_ptr = a; | |||
if (inc_x == 1 && inc_y == 1) { | |||
FLOAT *a0_ptr = a + lda * 0; | |||
FLOAT *a1_ptr = a + lda * 1; | |||
FLOAT *a2_ptr = a + lda * 2; | |||
FLOAT *a3_ptr = a + lda * 3; | |||
FLOAT *a4_ptr = a + lda * 4; | |||
FLOAT *a5_ptr = a + lda * 5; | |||
FLOAT *a6_ptr = a + lda * 6; | |||
FLOAT *a7_ptr = a + lda * 7; | |||
j = 0; | |||
while (j + 3 < n) { | |||
float32x4_t x0_vec = vld1q_f32(x + j); | |||
x0_vec = vmulq_n_f32(x0_vec, alpha); | |||
i = 0; | |||
while (i + 7 < m) { | |||
float32x4_t a00_vec = vld1q_f32(a0_ptr + i); | |||
float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); | |||
float32x4_t a10_vec = vld1q_f32(a1_ptr + i); | |||
float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); | |||
float32x4_t a20_vec = vld1q_f32(a2_ptr + i); | |||
float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4); | |||
float32x4_t a30_vec = vld1q_f32(a3_ptr + i); | |||
float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4); | |||
float32x4_t y0_vec = vld1q_f32(y + i); | |||
float32x4_t y1_vec = vld1q_f32(y + i + 4); | |||
y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0); | |||
y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1); | |||
y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2); | |||
y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3); | |||
y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0); | |||
y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1); | |||
y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2); | |||
y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3); | |||
vst1q_f32(y + i, y0_vec); | |||
vst1q_f32(y + i + 4, y1_vec); | |||
i += 8; | |||
} | |||
while (i + 3 < m) { | |||
float32x4_t a0_vec = vld1q_f32(a0_ptr + i); | |||
float32x4_t a1_vec = vld1q_f32(a1_ptr + i); | |||
float32x4_t a2_vec = vld1q_f32(a2_ptr + i); | |||
float32x4_t a3_vec = vld1q_f32(a3_ptr + i); | |||
float32x4_t y_vec = vld1q_f32(y + i); | |||
y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0); | |||
y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1); | |||
y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2); | |||
y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3); | |||
vst1q_f32(y + i, y_vec); | |||
i += 4; | |||
} | |||
while (i + 1 < m) { | |||
float32x2_t a0_vec = vld1_f32(a0_ptr + i); | |||
float32x2_t a1_vec = vld1_f32(a1_ptr + i); | |||
float32x2_t a2_vec = vld1_f32(a2_ptr + i); | |||
float32x2_t a3_vec = vld1_f32(a3_ptr + i); | |||
float32x2_t y_vec = vld1_f32(y + i); | |||
y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0); | |||
y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1); | |||
y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2); | |||
y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3); | |||
vst1_f32(y + i, y_vec); | |||
i += 2; | |||
} | |||
while (i < m) { | |||
y[i] += a0_ptr[i] * x0_vec[0]; | |||
y[i] += a1_ptr[i] * x0_vec[1]; | |||
y[i] += a2_ptr[i] * x0_vec[2]; | |||
y[i] += a3_ptr[i] * x0_vec[3]; | |||
i++; | |||
} | |||
a0_ptr += lda * 4; | |||
a1_ptr += lda * 4; | |||
a2_ptr += lda * 4; | |||
a3_ptr += lda * 4; | |||
j += 4; | |||
} | |||
while (j + 1 < n) { | |||
float32x2_t x0_vec = vld1_f32(x + j); | |||
x0_vec = vmul_n_f32(x0_vec, alpha); | |||
i = 0; | |||
while (i + 7 < m) { | |||
float32x4_t a00_vec = vld1q_f32(a0_ptr + i); | |||
float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); | |||
float32x4_t a10_vec = vld1q_f32(a1_ptr + i); | |||
float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); | |||
float32x4_t y0_vec = vld1q_f32(y + i); | |||
float32x4_t y1_vec = vld1q_f32(y + i + 4); | |||
y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0); | |||
y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1); | |||
y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0); | |||
y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1); | |||
vst1q_f32(y + i, y0_vec); | |||
vst1q_f32(y + i + 4, y1_vec); | |||
i += 8; | |||
} | |||
while (i + 3 < m) { | |||
float32x4_t a0_vec = vld1q_f32(a0_ptr + i); | |||
float32x4_t a1_vec = vld1q_f32(a1_ptr + i); | |||
float32x4_t y_vec = vld1q_f32(y + i); | |||
y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0); | |||
y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1); | |||
vst1q_f32(y + i, y_vec); | |||
i += 4; | |||
} | |||
while (i + 1 < m) { | |||
float32x2_t a0_vec = vld1_f32(a0_ptr + i); | |||
float32x2_t a1_vec = vld1_f32(a1_ptr + i); | |||
float32x2_t y_vec = vld1_f32(y + i); | |||
y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0); | |||
y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1); | |||
vst1_f32(y + i, y_vec); | |||
i += 2; | |||
} | |||
while (i < m) { | |||
y[i] += a0_ptr[i] * x0_vec[0]; | |||
y[i] += a1_ptr[i] * x0_vec[1]; | |||
i++; | |||
} | |||
a0_ptr += lda * 2; | |||
a1_ptr += lda * 2; | |||
j += 2; | |||
} | |||
while (j < n) { | |||
i = 0; | |||
temp = alpha * x[j]; | |||
while (i < m) { | |||
y[i] += a0_ptr[i] * temp; | |||
i++; | |||
} | |||
a0_ptr += lda; | |||
j++; | |||
} | |||
return (0); | |||
} | |||
for (j = 0; j < n; j++) { | |||
temp = alpha * x[ix]; | |||
iy = 0; | |||
for (i = 0; i < m; i++) { | |||
y[iy] += temp * a_ptr[i]; | |||
iy += inc_y; | |||
} | |||
a_ptr += lda; | |||
ix += inc_x; | |||
} | |||
return (0); | |||
} |
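For clarity, the kernel above implements the non-transposed SGEMV update y += alpha*A*x over column-major A with leading dimension lda, with the vectorised fast path specialised for unit strides. A scalar reference sketch that the NEON paths should agree with; the name sgemv_n_ref and the use of size_t (positive increments only) are illustrative:

#include <stddef.h>

/* y[i*inc_y] += alpha * sum_j a[j*lda + i] * x[j*inc_x]; assumes positive
 * increments, matching the loops above. */
static void sgemv_n_ref(size_t m, size_t n, float alpha,
                        const float *a, size_t lda,
                        const float *x, size_t inc_x,
                        float *y, size_t inc_y)
{
    for (size_t j = 0; j < n; j++) {
        float temp = alpha * x[j * inc_x];
        for (size_t i = 0; i < m; i++)
            y[i * inc_y] += temp * a[j * lda + i];
    }
}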