Update from develop branch for 0.3.19 release (tags/v0.3.19)
@@ -3,10 +3,13 @@ | |||
## | |||
cmake_minimum_required(VERSION 2.8.5) | |||
project(OpenBLAS C ASM) | |||
set(OpenBLAS_MAJOR_VERSION 0) | |||
set(OpenBLAS_MINOR_VERSION 3) | |||
set(OpenBLAS_PATCH_VERSION 19) | |||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
# Adhere to GNU filesystem layout conventions | |||
@@ -20,51 +23,68 @@ endif() | |||
####### | |||
if(MSVC) | |||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
endif() | |||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | |||
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) | |||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
else() | |||
set(NO_AFFINITY 1) | |||
endif() | |||
option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) | |||
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
option(BUILD_STATIC_LIBS "Build static library" OFF) | |||
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | |||
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
endif() | |||
if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) | |||
message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") | |||
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE) | |||
endif() | |||
# Add a prefix or suffix to all exported symbol names in the shared library. | |||
# Avoids conflicts with other BLAS libraries, especially when using | |||
# 64 bit integer interfaces in OpenBLAS. | |||
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) | |||
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
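(Aside: a minimal sketch of how a hypothetical superbuild might use these cache entries, assuming the OpenBLAS sources sit in an "OpenBLAS" subdirectory; the values are illustrative only.)
# Hypothetical superbuild snippet: an INTERFACE64 build whose exported symbols
# carry a _64 suffix, so it can coexist with a 32-bit-integer BLAS in one process.
set(INTERFACE64  ON    CACHE BOOL   "" FORCE)
set(SYMBOLSUFFIX "_64" CACHE STRING "" FORCE)
add_subdirectory(OpenBLAS)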
####### | |||
if(BUILD_WITHOUT_LAPACK) | |||
set(NO_LAPACK 1) | |||
set(NO_LAPACKE 1) | |||
endif() | |||
if(BUILD_WITHOUT_CBLAS) | |||
set(NO_CBLAS 1) | |||
endif() | |||
####### | |||
if(MSVC AND MSVC_STATIC_CRT) | |||
set(CompilerFlags | |||
CMAKE_CXX_FLAGS | |||
CMAKE_CXX_FLAGS_DEBUG | |||
CMAKE_CXX_FLAGS_RELEASE | |||
CMAKE_C_FLAGS | |||
CMAKE_C_FLAGS_DEBUG | |||
CMAKE_C_FLAGS_RELEASE | |||
) | |||
foreach(CompilerFlag ${CompilerFlags}) | |||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
endforeach() | |||
endif() | |||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
@@ -98,7 +118,7 @@ endif () | |||
# set which float types we want to build for | |||
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
# if none are defined, build for all | |||
# set(BUILD_BFLOAT16 true) | |||
set(BUILD_SINGLE true) | |||
set(BUILD_DOUBLE true) | |||
set(BUILD_COMPLEX true) | |||
@@ -143,9 +163,10 @@ endif () | |||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
if(MSVC) | |||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) | |||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) | |||
endif () | |||
# get obj vars into the format that add_library expects: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "") | |||
foreach (SUBDIR ${SUBDIRS}) | |||
@@ -183,12 +204,61 @@ if (${DYNAMIC_ARCH}) | |||
endif () | |||
# add objects to the openblas lib | |||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
if(NOT NO_LAPACK) | |||
add_library(LAPACK OBJECT ${LA_SOURCES}) | |||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>") | |||
endif() | |||
if(NOT NO_LAPACKE) | |||
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) | |||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>") | |||
endif() | |||
if(BUILD_RELAPACK) | |||
add_library(RELAPACK OBJECT ${RELA_SOURCES}) | |||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>") | |||
endif() | |||
set(OpenBLAS_LIBS "") | |||
if(BUILD_STATIC_LIBS) | |||
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static) | |||
endif() | |||
if(BUILD_SHARED_LIBS) | |||
add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared) | |||
endif() | |||
if(BUILD_STATIC_LIBS) | |||
add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static) | |||
else() | |||
add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared) | |||
endif() | |||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) | |||
# Android needs to explicitly link against libm | |||
if(ANDROID) | |||
target_link_libraries(${OpenBLAS_LIBNAME} m) | |||
if(BUILD_STATIC_LIBS) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_static m) | |||
endif() | |||
if(BUILD_SHARED_LIBS) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_shared m) | |||
endif() | |||
endif() | |||
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) | |||
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
if (NOT NOFORTRAN) | |||
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
else () | |||
set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
endif () | |||
endif() | |||
# Handle MSVC exports | |||
@@ -197,21 +267,21 @@ if(MSVC AND BUILD_SHARED_LIBS) | |||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
else() | |||
# Creates verbose .def file (51KB vs 18KB) | |||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) | |||
set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) | |||
endif() | |||
endif() | |||
# Set output for libopenblas | |||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | |||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | |||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
endforeach() | |||
enable_testing() | |||
@@ -220,10 +290,17 @@ if (USE_THREAD) | |||
# Add threading library to linker | |||
find_package(Threads) | |||
if (THREADS_HAVE_PTHREAD_ARG) | |||
set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread") | |||
set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") | |||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
COMPILE_OPTIONS "-pthread" | |||
INTERFACE_COMPILE_OPTIONS "-pthread" | |||
) | |||
endif() | |||
if(BUILD_STATIC_LIBS) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT}) | |||
endif() | |||
if(BUILD_SHARED_LIBS) | |||
target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT}) | |||
endif() | |||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) | |||
endif() | |||
#if (MSVC OR NOT NOFORTRAN) | |||
@@ -239,97 +316,109 @@ if (NOT NOFORTRAN) | |||
add_subdirectory(ctest) | |||
endif() | |||
add_subdirectory(lapack-netlib/TESTING) | |||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
add_subdirectory(cpp_thread_test) | |||
endif() | |||
endif() | |||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} | |||
SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
) | |||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
if (NOT MSVC) | |||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") | |||
else() | |||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") | |||
endif() | |||
endif() | |||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
if (NOT DEFINED ARCH) | |||
set(ARCH_IN "x86_64") | |||
else() | |||
set(ARCH_IN ${ARCH}) | |||
endif() | |||
if (${CORE} STREQUAL "generic") | |||
set(ARCH_IN "GENERIC") | |||
endif () | |||
if (NOT DEFINED EXPRECISION) | |||
set(EXPRECISION_IN 0) | |||
else() | |||
set(EXPRECISION_IN ${EXPRECISION}) | |||
endif() | |||
if (NOT DEFINED NO_CBLAS) | |||
set(NO_CBLAS_IN 0) | |||
else() | |||
set(NO_CBLAS_IN ${NO_CBLAS}) | |||
endif() | |||
if (NOT DEFINED NO_LAPACK) | |||
set(NO_LAPACK_IN 0) | |||
else() | |||
set(NO_LAPACK_IN ${NO_LAPACK}) | |||
endif() | |||
if (NOT DEFINED NO_LAPACKE) | |||
set(NO_LAPACKE_IN 0) | |||
else() | |||
set(NO_LAPACKE_IN ${NO_LAPACKE}) | |||
endif() | |||
if (NOT DEFINED NEED2UNDERSCORES) | |||
set(NEED2UNDERSCORES_IN 0) | |||
else() | |||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) | |||
endif() | |||
if (NOT DEFINED ONLY_CBLAS) | |||
set(ONLY_CBLAS_IN 0) | |||
else() | |||
set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
endif() | |||
if (NOT DEFINED BU) | |||
set(BU _) | |||
endif() | |||
if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
endif() | |||
if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
endif() | |||
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD | |||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
COMMENT "renaming symbols" | |||
) | |||
if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
endif() | |||
if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
endif() | |||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
COMMENT "renaming symbols" | |||
) | |||
endif() | |||
# Install project | |||
# Install libraries | |||
install(TARGETS ${OpenBLAS_LIBNAME} | |||
EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) | |||
install(TARGETS ${OpenBLAS_LIBNAME}_shared | |||
EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
install(TARGETS ${OpenBLAS_LIBNAME}_static | |||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
else() | |||
install(TARGETS ${OpenBLAS_LIBS} | |||
EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
endif() | |||
# Install headers | |||
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
@@ -365,36 +454,41 @@ if(NOT NOFORTRAN) | |||
endif() | |||
if(NOT NO_CBLAS) | |||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
endif() | |||
if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
endif() | |||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
endif() | |||
if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
endif() | |||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
endif() | |||
if(NOT NO_LAPACKE) | |||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) | |||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
ADD_CUSTOM_TARGET(genlapacke | |||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
) | |||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||
if(BUILD_STATIC_LIBS) | |||
add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) | |||
endif() | |||
if(BUILD_SHARED_LIBS) | |||
add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) | |||
endif() | |||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
ADD_CUSTOM_TARGET(genlapacke | |||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
) | |||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
endif() | |||
# Install pkg-config files | |||
@@ -419,4 +513,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
install(EXPORT "${PN}${SUFFIX64}Targets" | |||
NAMESPACE "${PN}${SUFFIX64}::" | |||
DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
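(Aside: once these targets are installed, a downstream project would typically consume them via find_package; a minimal sketch, assuming a default install without the 64-bit suffix, in which case the imported target is OpenBLAS::OpenBLAS.)
# Hypothetical consumer CMakeLists.txt
cmake_minimum_required(VERSION 3.5)
project(consumer C)
find_package(OpenBLAS CONFIG REQUIRED)      # loads the OpenBLASConfig.cmake installed above
add_executable(solver main.c)
target_link_libraries(solver PRIVATE OpenBLAS::OpenBLAS)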
@@ -197,3 +197,7 @@ In chronological order: | |||
* River Dillon <oss@outerpassage.net> | |||
* [2021-07-10] fix compilation with musl libc | |||
* Bine Brank <https://github.com/binebrank> | |||
* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE | |||
* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM |
@@ -1,4 +1,51 @@ | |||
OpenBLAS ChangeLog | |||
==================================================================== | |||
Version 0.3.19 | |||
19-Dec-2021 | |||
general: | |||
- reverted unsafe TRSV/ZTRSV optimizations introduced in 0.3.16
- fixed a potential thread race in the thread buffer reallocation routines | |||
that were introduced in 0.3.18 | |||
- fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE | |||
- fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG | |||
- made automatic library suffix for CMAKE builds with INTERFACE64 available | |||
to CBLAS-only builds | |||
x86_64: | |||
- DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities | |||
when an unknown CPUID is encountered, instead of defaulting to Prescott | |||
- added cpu detection for Intel Alder Lake | |||
- added cpu detection for Intel Sapphire Rapids | |||
- added an optimized SBGEMM kernel for Sapphire Rapids | |||
- fixed DYNAMIC_ARCH builds on OSX with CMAKE | |||
- worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX | |||
- fixed missing thread initialization for static builds on Windows/MSVC | |||
- fixed an excessive read in ZSYMV | |||
POWER: | |||
- added support for POWER10 in big-endian mode | |||
- added support for building with CMAKE | |||
- added optimized SGEMM and DGEMM kernels for small matrix sizes | |||
ARMV8: | |||
- added basic support and cputype detection for Fujitsu A64FX | |||
- added a generic ARMV8SVE target | |||
- added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX | |||
- added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus | |||
- fixed cpuid detection for Apple M1 and improved performance | |||
- improved compiler flag setting in CMAKE builds | |||
RISCV64: | |||
- fixed improper initialization in CSCAL/ZSCAL for strided access patterns | |||
MIPS: | |||
- added a GENERIC target for MIPS32 | |||
- added support for cross-compiling to MIPS32 on x86_64 using CMAKE | |||
MIPS64: | |||
- fixed misdetection of MSA capability | |||
==================================================================== | |||
Version 0.3.18 | |||
02-Oct-2021 | |||
@@ -32,7 +32,7 @@ export NOFORTRAN | |||
export NO_LAPACK | |||
endif | |||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | |||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | |||
@@ -1,6 +1,9 @@ | |||
ifneq ($(C_COMPILER), PGI) | |||
ifneq ($(GCCVERSIONGT4), 1) | |||
ifeq ($(C_COMPILER), CLANG) | |||
ISCLANG=1 | |||
endif | |||
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) | |||
CCOMMON_OPT += -march=armv8-a | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=armv8-a | |||
@@ -17,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a | |||
endif | |||
endif | |||
ifeq ($(CORE), ARMV8SVE) | |||
CCOMMON_OPT += -march=armv8-a+sve | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=armv8-a+sve | |||
endif | |||
endif | |||
ifeq ($(CORE), CORTEXA53) | |||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
ifneq ($(F_COMPILER), NAG) | |||
@@ -48,7 +58,7 @@ endif | |||
# Use a72 tunings because Neoverse-N1 is only available | |||
# in GCC>=9 | |||
ifeq ($(CORE), NEOVERSEN1) | |||
ifeq ($(GCCVERSIONGTEQ7), 1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
ifeq ($(GCCVERSIONGTEQ9), 1) | |||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
ifneq ($(F_COMPILER), NAG) | |||
@@ -70,7 +80,7 @@ endif | |||
# Use a53 tunings because a55 is only available in GCC>=8.1 | |||
ifeq ($(CORE), CORTEXA55) | |||
ifeq ($(GCCVERSIONGTEQ7), 1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
ifeq ($(GCCVERSIONGTEQ8), 1) | |||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
ifneq ($(F_COMPILER), NAG) | |||
@@ -132,7 +142,7 @@ FCOMMON_OPT += -march=armv8.3-a | |||
endif | |||
endif | |||
ifeq ($(GCCVERSIONGTEQ9), 1) | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | |||
ifeq ($(CORE), TSV110) | |||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
ifneq ($(F_COMPILER), NAG) | |||
@@ -150,6 +160,15 @@ endif | |||
endif | |||
endif | |||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
ifeq ($(CORE), A64FX) | |||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
endif | |||
endif | |||
endif | |||
endif | |||
endif | |||
endif |
@@ -3,7 +3,7 @@ | |||
# | |||
# This library's version | |||
VERSION = 0.3.18 | |||
VERSION = 0.3.18.dev | |||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
@@ -9,11 +9,10 @@ ifndef TOPDIR | |||
TOPDIR = . | |||
endif | |||
# If ARCH is not set, we use the host system's architecture for getarch compile options. | |||
ifndef ARCH | |||
# we need to use the host system's architecture for getarch compile options even especially when cross-compiling | |||
HOSTARCH := $(shell uname -m) | |||
else | |||
HOSTARCH = $(ARCH) | |||
ifeq ($(HOSTARCH), amd64) | |||
HOSTARCH=x86_64 | |||
endif | |||
# Catch conflicting usage of ARCH in some BSD environments | |||
@@ -102,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET | |||
ifeq ($(TARGET), GENERIC) | |||
ifeq ($(DYNAMIC_ARCH), 1) | |||
override NO_EXPRECISION=1 | |||
export NO_EXPRECiSION | |||
export NO_EXPRECISION | |||
endif | |||
endif | |||
endif | |||
@@ -119,6 +118,9 @@ endif | |||
ifeq ($(TARGET), COOPERLAKE) | |||
GETARCH_FLAGS := -DFORCE_NEHALEM | |||
endif | |||
ifeq ($(TARGET), SAPPHIRERAPIDS) | |||
GETARCH_FLAGS := -DFORCE_NEHALEM | |||
endif | |||
ifeq ($(TARGET), SANDYBRIDGE) | |||
GETARCH_FLAGS := -DFORCE_NEHALEM | |||
endif | |||
@@ -143,8 +145,13 @@ endif | |||
ifeq ($(TARGET), POWER8) | |||
GETARCH_FLAGS := -DFORCE_POWER6 | |||
endif | |||
ifeq ($(TARGET), POWER9) | |||
GETARCH_FLAGS := -DFORCE_POWER6 | |||
endif | |||
ifeq ($(TARGET), POWER10) | |||
GETARCH_FLAGS := -DFORCE_POWER6 | |||
endif | |||
endif | |||
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | |||
# | |||
@@ -164,6 +171,9 @@ endif | |||
ifeq ($(TARGET_CORE), COOPERLAKE) | |||
GETARCH_FLAGS := -DFORCE_NEHALEM | |||
endif | |||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
GETARCH_FLAGS := -DFORCE_NEHALEM | |||
endif | |||
ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
GETARCH_FLAGS := -DFORCE_NEHALEM | |||
endif | |||
@@ -251,6 +261,8 @@ endif | |||
#For small matrix optimization | |||
ifeq ($(ARCH), x86_64) | |||
SMALL_MATRIX_OPT = 1 | |||
else ifeq ($(CORE), POWER10) | |||
SMALL_MATRIX_OPT = 1 | |||
endif | |||
ifeq ($(SMALL_MATRIX_OPT), 1) | |||
CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||
@@ -260,6 +272,10 @@ endif | |||
ifndef GOTOBLAS_MAKEFILE | |||
export GOTOBLAS_MAKEFILE = 1 | |||
# Determine if the assembler is GNU Assembler | |||
HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) | |||
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) | |||
# Generating Makefile.conf and config.h | |||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
@@ -307,7 +323,7 @@ else | |||
SMP = 1 | |||
endif | |||
else | |||
ifeq ($(NUM_THREAD), 1) | |||
ifeq ($(NUM_THREADS), 1) | |||
SMP = | |||
else | |||
SMP = 1 | |||
@@ -892,15 +908,25 @@ endif | |||
ifeq ($(C_COMPILER), PGI) | |||
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) | |||
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) | |||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) | |||
PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) | |||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) | |||
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) | |||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) | |||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) | |||
NEWPGI := 1 | |||
PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) | |||
PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) | |||
PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) | |||
ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) | |||
NEWPGI2 := 1 | |||
endif | |||
endif | |||
ifdef BINARY64 | |||
ifeq ($(ARCH), x86_64) | |||
ifneq ($(NEWPGI2),1) | |||
CCOMMON_OPT += -tp p7-64 | |||
else | |||
CCOMMON_OPT += -tp px | |||
endif | |||
ifneq ($(NEWPGI),1) | |||
CCOMMON_OPT += -D__MMX__ -Mnollvm | |||
endif | |||
@@ -915,7 +941,11 @@ endif | |||
endif | |||
endif | |||
else | |||
ifneq ($(NEWPGI2),1) | |||
CCOMMON_OPT += -tp p7 | |||
else | |||
CCOMMON_OPT += -tp px | |||
endif | |||
endif | |||
endif | |||
@@ -1092,8 +1122,12 @@ FCOMMON_OPT += -i8 | |||
endif | |||
endif | |||
ifeq ($(ARCH), x86_64) | |||
ifneq ($(NEWPGI2),1) | |||
FCOMMON_OPT += -tp p7-64 | |||
else | |||
FCOMMON_OPT += -tp px | |||
endif | |||
else | |||
ifeq ($(ARCH), power) | |||
ifeq ($(CORE), POWER6) | |||
$(warning NVIDIA HPC compilers do not support POWER6.) | |||
@@ -1643,8 +1677,10 @@ export HAVE_VFP | |||
export HAVE_VFPV3 | |||
export HAVE_VFPV4 | |||
export HAVE_NEON | |||
export HAVE_MSA | |||
export MSA_FLAGS | |||
ifndef NO_MSA | |||
export HAVE_MSA | |||
export MSA_FLAGS | |||
endif | |||
export KERNELDIR | |||
export FUNCTION_PROFILE | |||
export TARGET_CORE | |||
@@ -81,6 +81,40 @@ CCOMMON_OPT += -march=cooperlake | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=cooperlake | |||
endif | |||
else # gcc does not support cooperlake, fall back to avx512
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
endif | |||
ifeq ($(OSNAME), CYGWIN_NT) | |||
CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
endif | |||
ifeq ($(OSNAME), WINNT) | |||
ifeq ($(C_COMPILER), GCC) | |||
CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
endif | |||
endif | |||
endif | |||
endif | |||
ifeq ($(CORE), SAPPHIRERAPIDS) | |||
ifndef NO_AVX512 | |||
ifeq ($(C_COMPILER), GCC) | |||
# Sapphire Rapids support was added in GCC 11
ifeq ($(GCCVERSIONGTEQ11), 1) | |||
CCOMMON_OPT += -march=sapphirerapids | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=sapphirerapids | |||
endif | |||
else # gcc does not support sapphirerapids, fall back to avx512
CCOMMON_OPT += -march=skylake-avx512 | |||
ifneq ($(F_COMPILER), NAG) | |||
FCOMMON_OPT += -march=skylake-avx512 | |||
endif | |||
endif | |||
endif | |||
ifeq ($(OSNAME), CYGWIN_NT) | |||
@@ -23,6 +23,7 @@ HASWELL | |||
SKYLAKEX | |||
ATOM | |||
COOPERLAKE | |||
SAPPHIRERAPIDS | |||
b)AMD CPU: | |||
ATHLON | |||
@@ -29,15 +29,15 @@ environment: | |||
global: | |||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | |||
matrix: | |||
- COMPILER: clang-cl | |||
WITH_FORTRAN: ON | |||
- COMPILER: clang-cl | |||
DYNAMIC_ARCH: ON | |||
WITH_FORTRAN: OFF | |||
- COMPILER: cl | |||
- COMPILER: MinGW64-gcc-7.2.0-mingw | |||
DYNAMIC_ARCH: OFF | |||
WITH_FORTRAN: ignore | |||
# - COMPILER: clang-cl | |||
# WITH_FORTRAN: ON | |||
# - COMPILER: clang-cl | |||
# DYNAMIC_ARCH: ON | |||
# WITH_FORTRAN: OFF | |||
# - COMPILER: cl | |||
# - COMPILER: MinGW64-gcc-7.2.0-mingw | |||
# DYNAMIC_ARCH: OFF | |||
# WITH_FORTRAN: ignore | |||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||
COMPILER: MinGW-gcc-6.3.0-32 | |||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||
@@ -46,6 +46,7 @@ environment: | |||
install: | |||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | |||
- if [%COMPILER%]==[clang-cl] conda update --yes -n base conda | |||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false | |||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||
@@ -64,8 +65,8 @@ before_build: | |||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | |||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. | |||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. | |||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
@@ -75,7 +75,50 @@ jobs: | |||
cd utest | |||
dir | |||
openblas_utest.exe | |||
- job: Windows_mingw_gmake | |||
pool: | |||
vmImage: 'windows-latest' | |||
steps: | |||
- script: | | |||
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" | |||
- job: Windows_clang_cmake | |||
pool: | |||
vmImage: 'windows-latest' | |||
steps: | |||
- script: | | |||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
set "CPATH=C:\Miniconda\Library\include;%CPATH% | |||
conda config --add channels conda-forge --force | |||
conda config --set auto_update_conda false | |||
conda install --yes ninja | |||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
mkdir build | |||
cd build | |||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. | |||
cmake --build . --config Release | |||
ctest | |||
- job: Windows_flang_clang | |||
pool: | |||
vmImage: 'windows-latest' | |||
steps: | |||
- script: | | |||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
set "CPATH=C:\Miniconda\Library\include;%CPATH%" | |||
conda config --add channels conda-forge --force | |||
conda config --set auto_update_conda false | |||
conda install --yes --quiet ninja flang | |||
mkdir build | |||
cd build | |||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
cmake --build . --config Release | |||
ctest | |||
- job: OSX_OpenMP | |||
pool: | |||
vmImage: 'macOS-10.15' | |||
@@ -122,7 +165,7 @@ jobs: | |||
make | |||
ctest | |||
- job: OSX_OpenMP_Clang_gf_cmake | |||
- job: OSX_dynarch_cmake | |||
pool: | |||
vmImage: 'macOS-10.15' | |||
variables: | |||
@@ -130,14 +173,12 @@ jobs: | |||
LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
steps: | |||
- script: | | |||
brew update | |||
brew install llvm libomp | |||
mkdir build | |||
cd build | |||
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. | |||
make | |||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. | |||
cmake --build . | |||
ctest | |||
- job: OSX_Ifort_Clang | |||
pool: | |||
vmImage: 'macOS-10.15' | |||
@@ -179,7 +220,7 @@ jobs: | |||
brew update | |||
brew install --cask android-ndk | |||
export ANDROID_NDK_HOME=/usr/local/share/android-ndk | |||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | |||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | |||
- job: OSX_IOS_ARMV8 | |||
pool: | |||
@@ -206,9 +247,9 @@ jobs: | |||
vmImage: 'ubuntu-latest' | |||
steps: | |||
- script: | | |||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ | |||
&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ | |||
|| exit 1 | |||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ | |||
&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ | |||
|| exit 1 | |||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | |||
alpine make DYNAMIC_ARCH=1 BINARY=64 | |||
@@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ | |||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
for(j = 0; j < m; j++){ | |||
for(i = 0; i < n * COMPSIZE; i++){ | |||
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
} | |||
} | |||
@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ | |||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
for(j = 0; j < m; j++){ | |||
for(i = 0; i < n * COMPSIZE; i++){ | |||
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
} | |||
} | |||
@@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64") | |||
endif () | |||
endif () | |||
if (MIPS64) | |||
if (MIPS32 OR MIPS64) | |||
set(NO_BINARY_MODE 1) | |||
endif () | |||
@@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS | |||
if (NO_BINARY_MODE) | |||
if (MIPS32) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") | |||
set(BINARY_DEFINED 1) | |||
endif () | |||
if (MIPS64) | |||
if (BINARY64) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") | |||
@@ -126,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE) | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL SAPPHIRERAPIDS) | |||
if (NOT DYNAMIC_ARCH) | |||
if (NOT NO_AVX512) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") | |||
endif() | |||
endif () | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL A64FX) | |||
if (NOT DYNAMIC_ARCH) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
endif() | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL ARMV8SVE) | |||
if (NOT DYNAMIC_ARCH) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL POWER10) | |||
if (NOT DYNAMIC_ARCH) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | |||
else () | |||
message(FATAL_ERROR "Compiler GCC ${GCC_VERSION} does not support Power10.")
endif() | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL POWER9) | |||
if (NOT DYNAMIC_ARCH) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") | |||
else () | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") | |||
endif () | |||
endif () | |||
endif () | |||
if (${CORE} STREQUAL POWER8) | |||
if (NOT DYNAMIC_ARCH) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
endif () | |||
endif () | |||
if (NOT DYNAMIC_ARCH) | |||
if (HAVE_AVX2) | |||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") | |||
@@ -3,11 +3,6 @@ | |||
## Description: Ported from portion of OpenBLAS/Makefile.system | |||
## Sets Fortran related variables. | |||
if (INTERFACE64) | |||
set(SUFFIX64 64) | |||
set(SUFFIX64_UNDERSCORE _64) | |||
endif() | |||
if (${F_COMPILER} STREQUAL "FLANG") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
if (BINARY64 AND INTERFACE64) | |||
@@ -1,214 +1,218 @@ | |||
# helper functions for the kernel CMakeLists.txt | |||
function(SetFallback KERNEL SOURCE_PATH) | |||
if (NOT (DEFINED ${KERNEL})) | |||
set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) | |||
endif () | |||
endfunction() | |||
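(Aside: SetFallback only assigns a kernel source when the per-target KERNEL file has not already defined that variable; a small illustration with hypothetical values.)
# Hypothetical values, for illustration only
set(SGEMVNKERNEL sgemv_n_custom.c)           # already chosen by a target's KERNEL file
SetFallback(SGEMVNKERNEL ../arm/gemv_n.c)    # no-op: SGEMVNKERNEL is already defined
SetFallback(DGEMVNKERNEL gemv_n.S)           # DGEMVNKERNEL was unset, so it becomes gemv_n.S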
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. | |||
macro(SetDefaultL1) | |||
set(SAMAXKERNEL amax.S) | |||
set(DAMAXKERNEL amax.S) | |||
set(QAMAXKERNEL amax.S) | |||
set(CAMAXKERNEL zamax.S) | |||
set(ZAMAXKERNEL zamax.S) | |||
set(XAMAXKERNEL zamax.S) | |||
set(SAMINKERNEL amin.S) | |||
set(DAMINKERNEL amin.S) | |||
set(QAMINKERNEL amin.S) | |||
set(CAMINKERNEL zamin.S) | |||
set(ZAMINKERNEL zamin.S) | |||
set(XAMINKERNEL zamin.S) | |||
set(SMAXKERNEL max.S) | |||
set(DMAXKERNEL max.S) | |||
set(QMAXKERNEL max.S) | |||
set(SMINKERNEL min.S) | |||
set(DMINKERNEL min.S) | |||
set(QMINKERNEL min.S) | |||
set(ISAMAXKERNEL iamax.S) | |||
set(IDAMAXKERNEL iamax.S) | |||
set(IQAMAXKERNEL iamax.S) | |||
set(ICAMAXKERNEL izamax.S) | |||
set(IZAMAXKERNEL izamax.S) | |||
set(IXAMAXKERNEL izamax.S) | |||
set(ISAMINKERNEL iamin.S) | |||
set(IDAMINKERNEL iamin.S) | |||
set(IQAMINKERNEL iamin.S) | |||
set(ICAMINKERNEL izamin.S) | |||
set(IZAMINKERNEL izamin.S) | |||
set(IXAMINKERNEL izamin.S) | |||
set(ISMAXKERNEL iamax.S) | |||
set(IDMAXKERNEL iamax.S) | |||
set(IQMAXKERNEL iamax.S) | |||
set(ISMINKERNEL iamin.S) | |||
set(IDMINKERNEL iamin.S) | |||
set(IQMINKERNEL iamin.S) | |||
set(SASUMKERNEL asum.S) | |||
set(DASUMKERNEL asum.S) | |||
set(CASUMKERNEL zasum.S) | |||
set(ZASUMKERNEL zasum.S) | |||
set(QASUMKERNEL asum.S) | |||
set(XASUMKERNEL zasum.S) | |||
set(SAXPYKERNEL axpy.S) | |||
set(DAXPYKERNEL axpy.S) | |||
set(CAXPYKERNEL zaxpy.S) | |||
set(ZAXPYKERNEL zaxpy.S) | |||
set(QAXPYKERNEL axpy.S) | |||
set(XAXPYKERNEL zaxpy.S) | |||
set(SCOPYKERNEL copy.S) | |||
set(DCOPYKERNEL copy.S) | |||
set(CCOPYKERNEL zcopy.S) | |||
set(ZCOPYKERNEL zcopy.S) | |||
set(QCOPYKERNEL copy.S) | |||
set(XCOPYKERNEL zcopy.S) | |||
set(SDOTKERNEL dot.S) | |||
set(DDOTKERNEL dot.S) | |||
set(CDOTKERNEL zdot.S) | |||
set(ZDOTKERNEL zdot.S) | |||
set(QDOTKERNEL dot.S) | |||
set(XDOTKERNEL zdot.S) | |||
set(SNRM2KERNEL nrm2.S) | |||
set(DNRM2KERNEL nrm2.S) | |||
set(QNRM2KERNEL nrm2.S) | |||
set(CNRM2KERNEL znrm2.S) | |||
set(ZNRM2KERNEL znrm2.S) | |||
set(XNRM2KERNEL znrm2.S) | |||
set(SROTKERNEL rot.S) | |||
set(DROTKERNEL rot.S) | |||
set(QROTKERNEL rot.S) | |||
set(CROTKERNEL zrot.S) | |||
set(ZROTKERNEL zrot.S) | |||
set(XROTKERNEL zrot.S) | |||
set(SSCALKERNEL scal.S) | |||
set(DSCALKERNEL scal.S) | |||
set(CSCALKERNEL zscal.S) | |||
set(ZSCALKERNEL zscal.S) | |||
set(QSCALKERNEL scal.S) | |||
set(XSCALKERNEL zscal.S) | |||
set(SSWAPKERNEL swap.S) | |||
set(DSWAPKERNEL swap.S) | |||
set(CSWAPKERNEL zswap.S) | |||
set(ZSWAPKERNEL zswap.S) | |||
set(QSWAPKERNEL swap.S) | |||
set(XSWAPKERNEL zswap.S) | |||
set(SGEMVNKERNEL gemv_n.S) | |||
set(SGEMVTKERNEL gemv_t.S) | |||
set(DGEMVNKERNEL gemv_n.S) | |||
set(DGEMVTKERNEL gemv_t.S) | |||
set(CGEMVNKERNEL zgemv_n.S) | |||
set(CGEMVTKERNEL zgemv_t.S) | |||
set(ZGEMVNKERNEL zgemv_n.S) | |||
set(ZGEMVTKERNEL zgemv_t.S) | |||
set(QGEMVNKERNEL gemv_n.S) | |||
set(QGEMVTKERNEL gemv_t.S) | |||
set(XGEMVNKERNEL zgemv_n.S) | |||
set(XGEMVTKERNEL zgemv_t.S) | |||
set(SCABS_KERNEL ../generic/cabs.c) | |||
set(DCABS_KERNEL ../generic/cabs.c) | |||
set(QCABS_KERNEL ../generic/cabs.c) | |||
set(LSAME_KERNEL ../generic/lsame.c) | |||
set(SAXPBYKERNEL ../arm/axpby.c) | |||
set(DAXPBYKERNEL ../arm/axpby.c) | |||
set(CAXPBYKERNEL ../arm/zaxpby.c) | |||
set(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
set(SSUMKERNEL sum.S) | |||
set(DSUMKERNEL sum.S) | |||
set(CSUMKERNEL zsum.S) | |||
set(ZSUMKERNEL zsum.S) | |||
set(QSUMKERNEL sum.S) | |||
set(XSUMKERNEL zsum.S) | |||
SetFallback(SAMAXKERNEL amax.S) | |||
SetFallback(DAMAXKERNEL amax.S) | |||
SetFallback(QAMAXKERNEL amax.S) | |||
SetFallback(CAMAXKERNEL zamax.S) | |||
SetFallback(ZAMAXKERNEL zamax.S) | |||
SetFallback(XAMAXKERNEL zamax.S) | |||
SetFallback(SAMINKERNEL amin.S) | |||
SetFallback(DAMINKERNEL amin.S) | |||
SetFallback(QAMINKERNEL amin.S) | |||
SetFallback(CAMINKERNEL zamin.S) | |||
SetFallback(ZAMINKERNEL zamin.S) | |||
SetFallback(XAMINKERNEL zamin.S) | |||
SetFallback(SMAXKERNEL max.S) | |||
SetFallback(DMAXKERNEL max.S) | |||
SetFallback(QMAXKERNEL max.S) | |||
SetFallback(SMINKERNEL min.S) | |||
SetFallback(DMINKERNEL min.S) | |||
SetFallback(QMINKERNEL min.S) | |||
SetFallback(ISAMAXKERNEL iamax.S) | |||
SetFallback(IDAMAXKERNEL iamax.S) | |||
SetFallback(IQAMAXKERNEL iamax.S) | |||
SetFallback(ICAMAXKERNEL izamax.S) | |||
SetFallback(IZAMAXKERNEL izamax.S) | |||
SetFallback(IXAMAXKERNEL izamax.S) | |||
SetFallback(ISAMINKERNEL iamin.S) | |||
SetFallback(IDAMINKERNEL iamin.S) | |||
SetFallback(IQAMINKERNEL iamin.S) | |||
SetFallback(ICAMINKERNEL izamin.S) | |||
SetFallback(IZAMINKERNEL izamin.S) | |||
SetFallback(IXAMINKERNEL izamin.S) | |||
SetFallback(ISMAXKERNEL iamax.S) | |||
SetFallback(IDMAXKERNEL iamax.S) | |||
SetFallback(IQMAXKERNEL iamax.S) | |||
SetFallback(ISMINKERNEL iamin.S) | |||
SetFallback(IDMINKERNEL iamin.S) | |||
SetFallback(IQMINKERNEL iamin.S) | |||
SetFallback(SASUMKERNEL asum.S) | |||
SetFallback(DASUMKERNEL asum.S) | |||
SetFallback(CASUMKERNEL zasum.S) | |||
SetFallback(ZASUMKERNEL zasum.S) | |||
SetFallback(QASUMKERNEL asum.S) | |||
SetFallback(XASUMKERNEL zasum.S) | |||
SetFallback(SAXPYKERNEL axpy.S) | |||
SetFallback(DAXPYKERNEL axpy.S) | |||
SetFallback(CAXPYKERNEL zaxpy.S) | |||
SetFallback(ZAXPYKERNEL zaxpy.S) | |||
SetFallback(QAXPYKERNEL axpy.S) | |||
SetFallback(XAXPYKERNEL zaxpy.S) | |||
SetFallback(SCOPYKERNEL copy.S) | |||
SetFallback(DCOPYKERNEL copy.S) | |||
SetFallback(CCOPYKERNEL zcopy.S) | |||
SetFallback(ZCOPYKERNEL zcopy.S) | |||
SetFallback(QCOPYKERNEL copy.S) | |||
SetFallback(XCOPYKERNEL zcopy.S) | |||
SetFallback(SDOTKERNEL dot.S) | |||
SetFallback(DDOTKERNEL dot.S) | |||
SetFallback(CDOTKERNEL zdot.S) | |||
SetFallback(ZDOTKERNEL zdot.S) | |||
SetFallback(QDOTKERNEL dot.S) | |||
SetFallback(XDOTKERNEL zdot.S) | |||
SetFallback(SNRM2KERNEL nrm2.S) | |||
SetFallback(DNRM2KERNEL nrm2.S) | |||
SetFallback(QNRM2KERNEL nrm2.S) | |||
SetFallback(CNRM2KERNEL znrm2.S) | |||
SetFallback(ZNRM2KERNEL znrm2.S) | |||
SetFallback(XNRM2KERNEL znrm2.S) | |||
SetFallback(SROTKERNEL rot.S) | |||
SetFallback(DROTKERNEL rot.S) | |||
SetFallback(QROTKERNEL rot.S) | |||
SetFallback(CROTKERNEL zrot.S) | |||
SetFallback(ZROTKERNEL zrot.S) | |||
SetFallback(XROTKERNEL zrot.S) | |||
SetFallback(SSCALKERNEL scal.S) | |||
SetFallback(DSCALKERNEL scal.S) | |||
SetFallback(CSCALKERNEL zscal.S) | |||
SetFallback(ZSCALKERNEL zscal.S) | |||
SetFallback(QSCALKERNEL scal.S) | |||
SetFallback(XSCALKERNEL zscal.S) | |||
SetFallback(SSWAPKERNEL swap.S) | |||
SetFallback(DSWAPKERNEL swap.S) | |||
SetFallback(CSWAPKERNEL zswap.S) | |||
SetFallback(ZSWAPKERNEL zswap.S) | |||
SetFallback(QSWAPKERNEL swap.S) | |||
SetFallback(XSWAPKERNEL zswap.S) | |||
SetFallback(SGEMVNKERNEL gemv_n.S) | |||
SetFallback(SGEMVTKERNEL gemv_t.S) | |||
SetFallback(DGEMVNKERNEL gemv_n.S) | |||
SetFallback(DGEMVTKERNEL gemv_t.S) | |||
SetFallback(CGEMVNKERNEL zgemv_n.S) | |||
SetFallback(CGEMVTKERNEL zgemv_t.S) | |||
SetFallback(ZGEMVNKERNEL zgemv_n.S) | |||
SetFallback(ZGEMVTKERNEL zgemv_t.S) | |||
SetFallback(QGEMVNKERNEL gemv_n.S) | |||
SetFallback(QGEMVTKERNEL gemv_t.S) | |||
SetFallback(XGEMVNKERNEL zgemv_n.S) | |||
SetFallback(XGEMVTKERNEL zgemv_t.S) | |||
SetFallback(SCABS_KERNEL ../generic/cabs.c) | |||
SetFallback(DCABS_KERNEL ../generic/cabs.c) | |||
SetFallback(QCABS_KERNEL ../generic/cabs.c) | |||
SetFallback(LSAME_KERNEL ../generic/lsame.c) | |||
SetFallback(SAXPBYKERNEL ../arm/axpby.c) | |||
SetFallback(DAXPBYKERNEL ../arm/axpby.c) | |||
SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) | |||
SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
SetFallback(SSUMKERNEL sum.S) | |||
SetFallback(DSUMKERNEL sum.S) | |||
SetFallback(CSUMKERNEL zsum.S) | |||
SetFallback(ZSUMKERNEL zsum.S) | |||
SetFallback(QSUMKERNEL sum.S) | |||
SetFallback(XSUMKERNEL zsum.S) | |||
if (BUILD_BFLOAT16) | |||
set(SHAMINKERNEL ../arm/amin.c) | |||
set(SHAMAXKERNEL ../arm/amax.c) | |||
set(SHMAXKERNEL ../arm/max.c) | |||
set(SHMINKERNEL ../arm/min.c) | |||
set(ISHAMAXKERNEL ../arm/iamax.c) | |||
set(ISHAMINKERNEL ../arm/iamin.c) | |||
set(ISHMAXKERNEL ../arm/imax.c) | |||
set(ISHMINKERNEL ../arm/imin.c) | |||
set(SHASUMKERNEL ../arm/asum.c) | |||
set(SHAXPYKERNEL ../arm/axpy.c) | |||
set(SHAXPBYKERNEL ../arm/axpby.c) | |||
set(SHCOPYKERNEL ../arm/copy.c) | |||
set(SBDOTKERNEL ../x86_64/sbdot.c) | |||
set(SHROTKERNEL ../arm/rot.c) | |||
set(SHSCALKERNEL ../arm/scal.c) | |||
set(SHNRM2KERNEL ../arm/nrm2.c) | |||
set(SHSUMKERNEL ../arm/sum.c) | |||
set(SHSWAPKERNEL ../arm/swap.c) | |||
set(TOBF16KERNEL ../x86_64/tobf16.c) | |||
set(BF16TOKERNEL ../x86_64/bf16to.c) | |||
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
SetFallback(SHAMINKERNEL ../arm/amin.c) | |||
SetFallback(SHAMAXKERNEL ../arm/amax.c) | |||
SetFallback(SHMAXKERNEL ../arm/max.c) | |||
SetFallback(SHMINKERNEL ../arm/min.c) | |||
SetFallback(ISHAMAXKERNEL ../arm/iamax.c) | |||
SetFallback(ISHAMINKERNEL ../arm/iamin.c) | |||
SetFallback(ISHMAXKERNEL ../arm/imax.c) | |||
SetFallback(ISHMINKERNEL ../arm/imin.c) | |||
SetFallback(SHASUMKERNEL ../arm/asum.c) | |||
SetFallback(SHAXPYKERNEL ../arm/axpy.c) | |||
SetFallback(SHAXPBYKERNEL ../arm/axpby.c) | |||
SetFallback(SHCOPYKERNEL ../arm/copy.c) | |||
SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) | |||
SetFallback(SHROTKERNEL ../arm/rot.c) | |||
SetFallback(SHSCALKERNEL ../arm/scal.c) | |||
SetFallback(SHNRM2KERNEL ../arm/nrm2.c) | |||
SetFallback(SHSUMKERNEL ../arm/sum.c) | |||
SetFallback(SHSWAPKERNEL ../arm/swap.c) | |||
SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) | |||
SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) | |||
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
endif () | |||
endmacro () | |||
macro(SetDefaultL2) | |||
set(SGEMVNKERNEL ../arm/gemv_n.c) | |||
set(SGEMVTKERNEL ../arm/gemv_t.c) | |||
set(DGEMVNKERNEL gemv_n.S) | |||
set(DGEMVTKERNEL gemv_t.S) | |||
set(CGEMVNKERNEL zgemv_n.S) | |||
set(CGEMVTKERNEL zgemv_t.S) | |||
set(ZGEMVNKERNEL zgemv_n.S) | |||
set(ZGEMVTKERNEL zgemv_t.S) | |||
set(QGEMVNKERNEL gemv_n.S) | |||
set(QGEMVTKERNEL gemv_t.S) | |||
set(XGEMVNKERNEL zgemv_n.S) | |||
set(XGEMVTKERNEL zgemv_t.S) | |||
set(SGERKERNEL ../generic/ger.c) | |||
set(DGERKERNEL ../generic/ger.c) | |||
set(QGERKERNEL ../generic/ger.c) | |||
set(CGERUKERNEL ../generic/zger.c) | |||
set(CGERCKERNEL ../generic/zger.c) | |||
set(ZGERUKERNEL ../generic/zger.c) | |||
set(ZGERCKERNEL ../generic/zger.c) | |||
set(XGERUKERNEL ../generic/zger.c) | |||
set(XGERCKERNEL ../generic/zger.c) | |||
set(SSYMV_U_KERNEL ../generic/symv_k.c) | |||
set(SSYMV_L_KERNEL ../generic/symv_k.c) | |||
set(DSYMV_U_KERNEL ../generic/symv_k.c) | |||
set(DSYMV_L_KERNEL ../generic/symv_k.c) | |||
set(QSYMV_U_KERNEL ../generic/symv_k.c) | |||
set(QSYMV_L_KERNEL ../generic/symv_k.c) | |||
set(CSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
set(CSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
set(XSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
set(XSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
set(CHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
set(CHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
set(CHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
set(CHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
set(XHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) | |||
SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) | |||
SetFallback(DGEMVNKERNEL gemv_n.S) | |||
SetFallback(DGEMVTKERNEL gemv_t.S) | |||
SetFallback(CGEMVNKERNEL zgemv_n.S) | |||
SetFallback(CGEMVTKERNEL zgemv_t.S) | |||
SetFallback(ZGEMVNKERNEL zgemv_n.S) | |||
SetFallback(ZGEMVTKERNEL zgemv_t.S) | |||
SetFallback(QGEMVNKERNEL gemv_n.S) | |||
SetFallback(QGEMVTKERNEL gemv_t.S) | |||
SetFallback(XGEMVNKERNEL zgemv_n.S) | |||
SetFallback(XGEMVTKERNEL zgemv_t.S) | |||
SetFallback(SGERKERNEL ../generic/ger.c) | |||
SetFallback(DGERKERNEL ../generic/ger.c) | |||
SetFallback(QGERKERNEL ../generic/ger.c) | |||
SetFallback(CGERUKERNEL ../generic/zger.c) | |||
SetFallback(CGERCKERNEL ../generic/zger.c) | |||
SetFallback(ZGERUKERNEL ../generic/zger.c) | |||
SetFallback(ZGERCKERNEL ../generic/zger.c) | |||
SetFallback(XGERUKERNEL ../generic/zger.c) | |||
SetFallback(XGERCKERNEL ../generic/zger.c) | |||
SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) | |||
SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) | |||
SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) | |||
SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) | |||
SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) | |||
SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) | |||
SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
if (BUILD_BFLOAT16) | |||
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
set(SHGERKERNEL ../generic/ger.c) | |||
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
SetFallback(SHGERKERNEL ../generic/ger.c) | |||
endif () | |||
endmacro () | |||
macro(SetDefaultL3) | |||
set(SGEADD_KERNEL ../generic/geadd.c) | |||
set(DGEADD_KERNEL ../generic/geadd.c) | |||
set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
SetFallback(SGEADD_KERNEL ../generic/geadd.c) | |||
SetFallback(DGEADD_KERNEL ../generic/geadd.c) | |||
SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) | |||
SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
if (BUILD_BFLOAT16) | |||
set(SHGEADD_KERNEL ../generic/geadd.c) | |||
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
set(SBGEMM_BETA ../generic/gemm_beta.c) | |||
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
set(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
SetFallback(SHGEADD_KERNEL ../generic/geadd.c) | |||
SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) | |||
SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
endif () | |||
endmacro () |
@@ -416,7 +416,7 @@ endif () | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "VORTEX") | |||
elseif ("${TCORE}" STREQUAL "VORTEX") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define ARMV8\n" | |||
"#define L1_CODE_SIZE\t32768\n" | |||
@@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") | |||
set(ZGEMM_UNROLL_M 4) | |||
set(ZGEMM_UNROLL_N 4) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "P5600") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L2_SIZE 1048576\n" | |||
"#define DTB_SIZE 4096\n" | |||
"#define DTB_DEFAULT_ENTRIES 64\n") | |||
set(SGEMM_UNROLL_M 2) | |||
set(SGEMM_UNROLL_N 2) | |||
set(DGEMM_UNROLL_M 2) | |||
set(DGEMM_UNROLL_N 2) | |||
set(CGEMM_UNROLL_M 2) | |||
set(CGEMM_UNROLL_N 2) | |||
set(ZGEMM_UNROLL_M 2) | |||
set(ZGEMM_UNROLL_N 2) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" MATCHES "MIPS") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L2_SIZE 262144\n" | |||
"#define DTB_SIZE 4096\n" | |||
"#define DTB_DEFAULT_ENTRIES 64\n") | |||
set(SGEMM_UNROLL_M 2) | |||
set(SGEMM_UNROLL_N 2) | |||
set(DGEMM_UNROLL_M 2) | |||
set(DGEMM_UNROLL_N 2) | |||
set(CGEMM_UNROLL_M 2) | |||
set(CGEMM_UNROLL_N 2) | |||
set(ZGEMM_UNROLL_M 2) | |||
set(ZGEMM_UNROLL_N 2) | |||
set(SYMV_P 16) | |||
elseif ("${TCORE}" STREQUAL "POWER6") | |||
file(APPEND ${TARGET_CONF_TEMP} | |||
"#define L1_DATA_SIZE 32768\n" | |||
@@ -33,7 +33,7 @@ endif () | |||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
set(NO_AVX 1) | |||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") | |||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") | |||
set(TARGET "NEHALEM") | |||
endif () | |||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
@@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") | |||
set(TARGET "ARMV7") | |||
endif () | |||
if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") | |||
set(TARGET "POWER6") | |||
endif () | |||
endif () | |||
@@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) | |||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") | |||
endif () | |||
if (POWER) | |||
set(NO_WARMUP 1) | |||
set(HAVE_GAS 1) | |||
if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") | |||
set(HAVE_GAS 0) | |||
elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") | |||
set(HAVE_GAS 0) | |||
endif () | |||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") | |||
endif () | |||
# If Fortran is not used, only CBLAS will be compiled. | |||
if (ONLY_CBLAS) | |||
set(NO_LAPACK 1) | |||
@@ -163,6 +178,22 @@ if (DEFINED TARGET) | |||
endif() | |||
endif() | |||
endif() | |||
if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) | |||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") | |||
else() | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
endif() | |||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") | |||
else() | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
endif() | |||
endif() | |||
endif() | |||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
endif() | |||
@@ -206,6 +237,27 @@ if (DEFINED TARGET) | |||
if (DEFINED HAVE_SSE4_1) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") | |||
endif() | |||
if (${TARGET} STREQUAL POWER10) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | |||
else () | |||
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") | |||
endif() | |||
endif() | |||
if (${TARGET} STREQUAL POWER9) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") | |||
else () | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") | |||
endif() | |||
endif() | |||
if (${TARGET} STREQUAL POWER8) | |||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
endif() | |||
endif() | |||
if (DEFINED BINARY) | |||
message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
@@ -223,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") | |||
# C Compiler dependent settings | |||
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") | |||
if (INTERFACE64) | |||
set(SUFFIX64 64) | |||
set(SUFFIX64_UNDERSCORE _64) | |||
endif() | |||
if (NOT NOFORTRAN) | |||
# Fortran Compiler dependent settings | |||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | |||
@@ -258,7 +315,7 @@ if (NEED_PIC) | |||
endif() | |||
endif () | |||
if (X86_64) | |||
if (X86_64 OR ${CORE} STREQUAL POWER10) | |||
set(SMALL_MATRIX_OPT TRUE) | |||
endif () | |||
if (SMALL_MATRIX_OPT) | |||
@@ -266,7 +323,7 @@ if (SMALL_MATRIX_OPT) | |||
endif () | |||
if (DYNAMIC_ARCH) | |||
if (X86 OR X86_64 OR ARM64 OR PPC) | |||
if (X86 OR X86_64 OR ARM64 OR POWER) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
if (DYNAMIC_OLDER) | |||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
@@ -20,11 +20,11 @@ endif() | |||
if(CMAKE_COMPILER_IS_GNUCC AND WIN32) | |||
if(MINGW) | |||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine | |||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE | |||
OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE | |||
OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") | |||
if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") | |||
set(MINGW64 1) | |||
endif() | |||
endif() | |||
@@ -35,7 +35,7 @@ if(CMAKE_CL_64 OR MINGW64) | |||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) | |||
set(X86 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
set(PPC 1) | |||
set(POWER 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
set(MIPS64 1) | |||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||
@@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) | |||
else () | |||
set(X86 1) | |||
endif() | |||
elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") | |||
set(MIPS32 1) | |||
elseif (${TARGET} STREQUAL "ARMV7") | |||
set(ARM 1) | |||
else() | |||
@@ -86,8 +88,12 @@ if (X86_64) | |||
set(ARCH "x86_64") | |||
elseif(X86) | |||
set(ARCH "x86") | |||
elseif(PPC) | |||
elseif(POWER) | |||
set(ARCH "power") | |||
elseif(MIPS32) | |||
set(ARCH "mips") | |||
elseif(MIPS64) | |||
set(ARCH "mips64") | |||
elseif(ARM) | |||
set(ARCH "arm") | |||
elseif(ARM64) | |||
@@ -97,7 +103,7 @@ else() | |||
endif () | |||
if (NOT BINARY) | |||
if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) | |||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) | |||
set(BINARY 64) | |||
else () | |||
set(BINARY 32) | |||
@@ -15,35 +15,83 @@ endfunction () | |||
# Reads a Makefile into CMake vars. | |||
macro(ParseMakefileVars MAKEFILE_IN) | |||
message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
set (IfElse 0) | |||
set (ElseSeen 0) | |||
set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | |||
set (IfElse 0) | |||
set (ElseSeen 0) | |||
set (SkipIfs 0) | |||
set (SkipElse 0) | |||
file(STRINGS ${MAKEFILE_IN} makefile_contents) | |||
foreach (makefile_line ${makefile_contents}) | |||
#message(STATUS "parsing ${makefile_line}") | |||
#message(STATUS "parsing ${makefile_line}") | |||
# Skip the entire scope of an else branch when the if statement that precedes it already had the valid condition. | |||
# SkipIfs counts nested ifs so that we know which endif statement closes the scope of the skipped else. | |||
if (${SkipElse} EQUAL 1) | |||
#message(STATUS "skipping ${makefile_line}") | |||
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
MATH(EXPR SkipIfs "${SkipIfs}+1") | |||
endif () | |||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
if (${SkipIfs} EQUAL 0) | |||
set (SkipElse 0) | |||
else () | |||
MATH(EXPR SkipIfs "${SkipIfs}-1") | |||
endif () | |||
endif () | |||
continue () | |||
endif () | |||
# IfElse is greater than 0 while we are inside an if/else block: it is set when an if-type statement is parsed and reset at the matching endif. | |||
if (${IfElse} GREATER 0) | |||
# If the current scope is being skipped, any nested if/else/endif statements inside it | |||
# must be ignored as well, up to the endif that closes the current scope. | |||
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
#message(STATUS "skipping ${makefile_line}") | |||
MATH(EXPR SkipIfs "${SkipIfs}+1") | |||
continue () | |||
endif () | |||
endif () | |||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
# message(STATUS "ENDIF ${makefile_line}") | |||
set (IfElse 0) | |||
set (ElseSeen 0) | |||
if (${SkipIfs} EQUAL 0) | |||
#message(STATUS "ENDIF ${makefile_line}") | |||
set (IfElse 0) | |||
set (ElseSeen 0) | |||
else () | |||
#message(STATUS "skipping ${makefile_line}") | |||
MATH(EXPR SkipIfs "${SkipIfs}-1") | |||
endif () | |||
continue () | |||
endif () | |||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
# message(STATUS "ELSE ${makefile_line}") | |||
set (ElseSeen 1) | |||
continue () | |||
endif() | |||
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
# message(STATUS "skipping ${makefile_line}") | |||
continue () | |||
if (NOT "${line_match}" STREQUAL "") | |||
if (${SkipIfs} EQUAL 0) | |||
#message(STATUS "ELSE ${makefile_line}") | |||
set (ElseSeen 1) | |||
else () | |||
#message(STATUS "skipping ${makefile_line}") | |||
endif () | |||
continue () | |||
endif() | |||
# Skip the lines that are not part of the path that has to be taken. | |||
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) | |||
#message(STATUS "skipping ${makefile_line}") | |||
continue () | |||
endif () | |||
endif () | |||
endif () | |||
# Skip commented lines (the ones that start with '#') | |||
string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "skipping ${makefile_line}") | |||
continue () | |||
endif () | |||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "match on ${line_match}") | |||
#message(STATUS "match on ${line_match}") | |||
set(var_name ${CMAKE_MATCH_1}) | |||
# set(var_value ${CMAKE_MATCH_2}) | |||
#set(var_value ${CMAKE_MATCH_2}) | |||
string(STRIP ${CMAKE_MATCH_2} var_value) | |||
# check for Makefile variables in the string, e.g. $(TSUFFIX) | |||
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | |||
@@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) | |||
endforeach () | |||
set(${var_name} ${var_value}) | |||
else () | |||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "match on include ${line_match}") | |||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
continue () | |||
endif () | |||
# Include a new file to be parsed | |||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "match on include ${line_match}") | |||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
continue () | |||
endif () | |||
# The if statement that precedes this else was the branch taken, | |||
# so the scope of this else statement has to be skipped. | |||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "skipping ${makefile_line}") | |||
set (SkipElse 1) | |||
continue() | |||
endif() | |||
# Example 1: ifdef HAVE_MSA | |||
# Example 2: ifndef ZNRM2KERNEL | |||
string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") | |||
set (ElseSeen 0) | |||
if (DEFINED ${CMAKE_MATCH_2}) | |||
if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
#message (STATUS "condition is true") | |||
set (IfElse 1) | |||
else () | |||
set (IfElse 2) | |||
endif () | |||
else () | |||
# message(STATUS "unmatched line ${line_match}") | |||
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||
# message (STATUS "condition is true") | |||
set (IfElse 1) | |||
else () | |||
set (IfElse 2) | |||
endif () | |||
if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
set (IfElse 2) | |||
else () | |||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) | |||
set (CMAKE_MATCH_1 CMAKE_C_COMPILER) | |||
endif () | |||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | |||
# message (STATUS "condition is true") | |||
set (IfElse 1) | |||
else () | |||
set (IfElse 2) | |||
endif () | |||
endif () | |||
#message (STATUS "condition is true") | |||
set (IfElse 1) | |||
endif () | |||
endif () | |||
continue () | |||
endif () | |||
# Example 1: ifeq ($(SGEMM_UNROLL_M), 16) | |||
# Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||
# Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) | |||
# Ignore the second group since non-capturing groups (?:...) are not supported by CMake regular expressions | |||
string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") | |||
if (DEFINED ${CMAKE_MATCH_1}) | |||
if (DEFINED ${CMAKE_MATCH_4}) | |||
set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) | |||
else () | |||
set (STR ${${CMAKE_MATCH_1}}) | |||
endif () | |||
if (${STR} STREQUAL ${CMAKE_MATCH_5}) | |||
#message (STATUS "condition is true") | |||
set (IfElse 1) | |||
continue () | |||
endif () | |||
endif () | |||
set (IfElse 2) | |||
continue () | |||
endif () | |||
# Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
# Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) | |||
string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") | |||
if (NOT "${line_match}" STREQUAL "") | |||
#message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") | |||
set (ElseSeen 0) | |||
set (HasValidGroup 0) | |||
if (DEFINED ${CMAKE_MATCH_3}) | |||
set (HasValidGroup 1) | |||
set (STR ${${CMAKE_MATCH_3}}) | |||
elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") | |||
set (HasValidGroup 1) | |||
set (STR ${CMAKE_MATCH_4}) | |||
endif () | |||
if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) | |||
if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) | |||
#message (STATUS "condition is true") | |||
set (IfElse 1) | |||
continue () | |||
endif () | |||
endif () | |||
set (IfElse 2) | |||
continue () | |||
endif () | |||
#message(STATUS "unmatched line ${line_match}") | |||
endforeach () | |||
endmacro () | |||
@@ -1,13 +1,14 @@ | |||
include ../Makefile.rule | |||
TOPDIR = .. | |||
include $(TOPDIR)/Makefile.system | |||
all :: dgemv_tester dgemm_tester | |||
dgemv_tester : | |||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester | |||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester | |||
./dgemv_tester | |||
dgemm_tester : dgemv_tester | |||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester | |||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester | |||
./dgemm_tester | |||
clean :: | |||
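(The two testers above simply issue DGEMV/DGEMM calls from many OpenMP threads and compare results. As a rough sketch of that kind of check, not the actual dgemv/dgemm_thread_safety.cpp sources and with made-up problem sizes, a C program linked against the built library could look like this:)

/* build e.g.: cc -fopenmp -std=c99 tester.c -lopenblas */
#include <stdio.h>
#include <string.h>
#include <cblas.h>

#define N 64
#define TRIALS 100

int main(void)
{
    static double a[N * N], b[N * N], ref[N * N];
    for (int i = 0; i < N * N; i++) { a[i] = 1.0; b[i] = 2.0; }

    /* reference result, computed once and single-threaded */
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                N, N, N, 1.0, a, N, b, N, 0.0, ref, N);

    int ok = 1;
    /* many concurrent callers, each with its own output buffer */
    #pragma omp parallel for reduction(&&:ok)
    for (int t = 0; t < TRIALS; t++) {
        double c[N * N];
        memset(c, 0, sizeof(c));
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    N, N, N, 1.0, a, N, b, N, 0.0, c, N);
        ok = ok && (memcmp(c, ref, sizeof(c)) == 0);
    }
    puts(ok ? "results consistent" : "MISMATCH: possible thread-safety problem");
    return !ok;
}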
@@ -120,6 +120,7 @@ | |||
#define CORE_SKYLAKEX 28 | |||
#define CORE_DHYANA 29 | |||
#define CORE_COOPERLAKE 30 | |||
#define CORE_SAPPHIRERAPIDS 31 | |||
#define HAVE_SSE (1 << 0) | |||
#define HAVE_SSE2 (1 << 1) | |||
@@ -145,6 +146,7 @@ | |||
#define HAVE_AVX512VL (1 << 21) | |||
#define HAVE_AVX2 (1 << 22) | |||
#define HAVE_AVX512BF16 (1 << 23) | |||
#define HAVE_AMXBF16 (1 << 24) | |||
#define CACHE_INFO_L1_I 1 | |||
#define CACHE_INFO_L1_D 2 | |||
@@ -222,6 +224,7 @@ typedef struct { | |||
#define CPUTYPE_SKYLAKEX 52 | |||
#define CPUTYPE_DHYANA 53 | |||
#define CPUTYPE_COOPERLAKE 54 | |||
#define CPUTYPE_SAPPHIRERAPIDS 55 | |||
#define CPUTYPE_HYGON_UNKNOWN 99 | |||
@@ -26,10 +26,12 @@ | |||
*****************************************************************************/ | |||
#include <string.h> | |||
#ifdef OS_DARWIN | |||
#ifdef __APPLE__ | |||
#include <sys/sysctl.h> | |||
int32_t value; | |||
size_t length=sizeof(value); | |||
int64_t value64; | |||
size_t length64=sizeof(value64); | |||
#endif | |||
#define CPU_UNKNOWN 0 | |||
@@ -53,6 +55,8 @@ size_t length=sizeof(value); | |||
#define CPU_EMAG8180 10 | |||
// Apple | |||
#define CPU_VORTEX 13 | |||
// Fujitsu | |||
#define CPU_A64FX 15 | |||
static char *cpuname[] = { | |||
"UNKNOWN", | |||
@@ -69,7 +73,8 @@ static char *cpuname[] = { | |||
"NEOVERSEN1", | |||
"THUNDERX3T110", | |||
"VORTEX", | |||
"CORTEXA55" | |||
"CORTEXA55", | |||
"A64FX" | |||
}; | |||
static char *cpuname_lower[] = { | |||
@@ -87,7 +92,8 @@ static char *cpuname_lower[] = { | |||
"neoversen1", | |||
"thunderx3t110", | |||
"vortex", | |||
"cortexa55" | |||
"cortexa55", | |||
"a64fx" | |||
}; | |||
int get_feature(char *search) | |||
@@ -183,6 +189,9 @@ int detect(void) | |||
// Ampere | |||
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) | |||
return CPU_EMAG8180; | |||
// Fujitsu | |||
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) | |||
return CPU_A64FX; | |||
} | |||
p = (char *) NULL ; | |||
@@ -212,9 +221,9 @@ int detect(void) | |||
} | |||
#else | |||
#ifdef DARWIN | |||
#ifdef __APPLE__ | |||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0); | |||
if (value ==131287967) return CPU_VORTEX; | |||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; | |||
#endif | |||
return CPU_ARMV8; | |||
#endif | |||
@@ -265,7 +274,7 @@ int n=0; | |||
printf("#define NUM_CORES %d\n",n); | |||
#endif | |||
#ifdef DARWIN | |||
#ifdef __APPLE__ | |||
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
printf("#define NUM_CORES %d\n",value); | |||
#endif | |||
@@ -285,154 +294,166 @@ void get_cpuconfig(void) | |||
switch (d) | |||
{ | |||
case CPU_CORTEXA53: | |||
case CPU_CORTEXA55: | |||
printf("#define %s\n", cpuname[d]); | |||
// Fall-through | |||
case CPU_ARMV8: | |||
// Minimum parameters for ARMv8 (based on A53) | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L2_SIZE 262144\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 4\n"); | |||
case CPU_CORTEXA53: | |||
case CPU_CORTEXA55: | |||
printf("#define %s\n", cpuname[d]); | |||
// Fall-through | |||
case CPU_ARMV8: | |||
// Minimum parameters for ARMv8 (based on A53) | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L2_SIZE 262144\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 4\n"); | |||
break; | |||
case CPU_CORTEXA57: | |||
case CPU_CORTEXA72: | |||
case CPU_CORTEXA73: | |||
case CPU_CORTEXA57: | |||
case CPU_CORTEXA72: | |||
case CPU_CORTEXA73: | |||
// Common minimum settings for these Arm cores | |||
// Can change a lot, but we need to be conservative | |||
// TODO: detect info from /sys if possible | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 49152\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
printf("#define L2_SIZE 524288\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_NEOVERSEN1: | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
printf("#define L1_DATA_SIZE 65536\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
printf("#define L2_SIZE 1048576\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_FALKOR: | |||
printf("#define FALKOR\n"); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 128\n"); | |||
printf("#define L2_SIZE 524288\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
break; | |||
case CPU_THUNDERX: | |||
printf("#define THUNDERX\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 128\n"); | |||
printf("#define L2_SIZE 16777216\n"); | |||
printf("#define L2_LINESIZE 128\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
break; | |||
case CPU_THUNDERX2T99: | |||
printf("#define THUNDERX2T99 \n"); | |||
printf("#define L1_CODE_SIZE 32768 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
printf("#define L1_DATA_SIZE 32768 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
printf("#define L2_SIZE 262144 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define L3_SIZE 33554432 \n"); | |||
printf("#define L3_LINESIZE 64 \n"); | |||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 49152\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
printf("#define L2_SIZE 524288\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_NEOVERSEN1: | |||
printf("#define %s\n", cpuname[d]); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
printf("#define L1_DATA_SIZE 65536\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
printf("#define L2_SIZE 1048576\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_FALKOR: | |||
printf("#define FALKOR\n"); | |||
printf("#define L1_CODE_SIZE 65536\n"); | |||
printf("#define L1_CODE_LINESIZE 64\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 128\n"); | |||
printf("#define L2_SIZE 524288\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
break; | |||
case CPU_THUNDERX: | |||
printf("#define THUNDERX\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 128\n"); | |||
printf("#define L2_SIZE 16777216\n"); | |||
printf("#define L2_LINESIZE 128\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
break; | |||
case CPU_THUNDERX2T99: | |||
printf("#define THUNDERX2T99 \n"); | |||
printf("#define L1_CODE_SIZE 32768 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
printf("#define L1_DATA_SIZE 32768 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
printf("#define L2_SIZE 262144 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define L3_SIZE 33554432 \n"); | |||
printf("#define L3_LINESIZE 64 \n"); | |||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
case CPU_TSV110: | |||
printf("#define TSV110 \n"); | |||
printf("#define L1_CODE_SIZE 65536 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
printf("#define L1_DATA_SIZE 65536 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
printf("#define L2_SIZE 524228 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
case CPU_EMAG8180: | |||
// Minimum parameters for ARMv8 (based on A53) | |||
printf("#define EMAG8180\n"); | |||
printf("#define L1_CODE_SIZE 32768\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L2_SIZE 262144\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_THUNDERX3T110: | |||
printf("#define THUNDERX3T110 \n"); | |||
printf("#define L1_CODE_SIZE 65536 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
printf("#define L1_DATA_SIZE 32768 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
printf("#define L2_SIZE 524288 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define L3_SIZE 94371840 \n"); | |||
printf("#define L3_LINESIZE 64 \n"); | |||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
#ifdef DARWIN | |||
case CPU_VORTEX: | |||
printf("#define VORTEX \n"); | |||
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); | |||
printf("#define L1_CODE_SIZE %d \n",value); | |||
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); | |||
printf("#define L1_CODE_LINESIZE %d \n",value); | |||
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); | |||
printf("#define L1_DATA_SIZE %d \n",value); | |||
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); | |||
printf("#define L2_SIZE %d \n",value); | |||
break; | |||
case CPU_TSV110: | |||
printf("#define TSV110 \n"); | |||
printf("#define L1_CODE_SIZE 65536 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
printf("#define L1_DATA_SIZE 65536 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
printf("#define L2_SIZE 524228 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
case CPU_EMAG8180: | |||
// Minimum parameters for ARMv8 (based on A53) | |||
printf("#define EMAG8180\n"); | |||
printf("#define L1_CODE_SIZE 32768\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 64\n"); | |||
printf("#define L2_SIZE 262144\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_THUNDERX3T110: | |||
printf("#define THUNDERX3T110 \n"); | |||
printf("#define L1_CODE_SIZE 65536 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
printf("#define L1_DATA_SIZE 32768 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
printf("#define L2_SIZE 524288 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define L3_SIZE 94371840 \n"); | |||
printf("#define L3_LINESIZE 64 \n"); | |||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
#ifdef __APPLE__ | |||
case CPU_VORTEX: | |||
printf("#define VORTEX \n"); | |||
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
printf("#define L1_CODE_SIZE %lld \n",value64); | |||
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
printf("#define L1_DATA_SIZE %lld \n",value64); | |||
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
printf("#define L2_SIZE %lld \n",value64); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
#endif | |||
case CPU_A64FX: | |||
printf("#define A64FX\n"); | |||
printf("#define L1_CODE_SIZE 65535\n"); | |||
printf("#define L1_DATA_SIZE 65535\n"); | |||
printf("#define L1_DATA_LINESIZE 256\n"); | |||
printf("#define L2_SIZE 8388608\n"); | |||
printf("#define L2_LINESIZE 256\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
} | |||
get_cpucount(); | |||
} | |||
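(The switch from value/length to value64/length64 above is presumably because several hw.* sysctl entries are 64-bit on Apple Silicon, so querying them into a 32-bit buffer can fail or truncate. A minimal macOS-only sketch of the same sysctlbyname() pattern, independent of the code above:)

#include <stdio.h>
#include <stdint.h>
#ifdef __APPLE__
#include <sys/sysctl.h>
#endif

int main(void)
{
#ifdef __APPLE__
    int64_t value64 = 0;
    size_t length64 = sizeof(value64);
    /* hw.l2cachesize is exposed as a 64-bit value on Apple Silicon */
    if (sysctlbyname("hw.l2cachesize", &value64, &length64, NULL, 0) == 0)
        printf("#define L2_SIZE %lld \n", (long long)value64);
#endif
    return 0;
}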
@@ -165,6 +165,7 @@ void get_cpuconfig(void){ | |||
}else{ | |||
printf("#define UNKNOWN\n"); | |||
} | |||
if (!get_feature(msa)) printf("#define NO_MSA\n"); | |||
} | |||
void get_libname(void){ | |||
@@ -178,3 +179,38 @@ void get_libname(void){ | |||
printf("mips\n"); | |||
} | |||
} | |||
int get_feature(char *search) | |||
{ | |||
#ifdef __linux | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) | |||
{ | |||
if (!strncmp("Features", buffer, 8)) | |||
{ | |||
p = strchr(buffer, ':') + 2; | |||
break; | |||
} | |||
} | |||
fclose(infile); | |||
if( p == NULL ) return 0; | |||
t = strtok(p," "); | |||
while( t = strtok(NULL," ")) | |||
{ | |||
if (!strcmp(t, search)) { return(1); } | |||
} | |||
#endif | |||
return(0); | |||
} | |||
@@ -104,17 +104,17 @@ int detect(void){ | |||
} | |||
} | |||
fclose(infile); | |||
if(p != NULL){ | |||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
return CPU_LOONGSON3R3; | |||
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
return CPU_LOONGSON3R4; | |||
} else{ | |||
return CPU_SICORTEX; | |||
if (p != NULL){ | |||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
return CPU_LOONGSON3R3; | |||
} else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
return CPU_LOONGSON3R4; | |||
} else{ | |||
return CPU_SICORTEX; | |||
} | |||
} | |||
#endif | |||
return CPU_UNKNOWN; | |||
} | |||
} | |||
char *get_corename(void){ | |||
@@ -201,6 +201,7 @@ void get_cpuconfig(void){ | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 8\n"); | |||
} | |||
if (!get_feature(msa)) printf("#define NO_MSA\n"); | |||
} | |||
void get_libname(void){ | |||
@@ -218,3 +219,38 @@ void get_libname(void){ | |||
printf("mips64\n"); | |||
} | |||
} | |||
int get_feature(char *search) | |||
{ | |||
#ifdef __linux | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) | |||
{ | |||
if (!strncmp("Features", buffer, 8)) | |||
{ | |||
p = strchr(buffer, ':') + 2; | |||
break; | |||
} | |||
} | |||
fclose(infile); | |||
if( p == NULL ) return 0; | |||
t = strtok(p," "); | |||
while( t = strtok(NULL," ")) | |||
{ | |||
if (!strcmp(t, search)) { return(1); } | |||
} | |||
#endif | |||
return(0); | |||
} | |||
@@ -1,3 +1,4 @@ | |||
//{ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
@@ -266,6 +267,31 @@ int support_avx512_bf16(){ | |||
#endif | |||
} | |||
#define BIT_AMX_TILE 0x01000000 | |||
#define BIT_AMX_BF16 0x00400000 | |||
#define BIT_AMX_ENBD 0x00060000 | |||
int support_amx_bf16() { | |||
#if !defined(NO_AVX) && !defined(NO_AVX512) | |||
int eax, ebx, ecx, edx; | |||
int ret=0; | |||
if (!support_avx512()) | |||
return 0; | |||
// CPUID.7.0:EDX indicates AMX support | |||
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); | |||
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { | |||
// CPUID.D.0:EAX[17:18] indicates AMX enabled | |||
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | |||
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) | |||
ret = 1; | |||
} | |||
return ret; | |||
#else | |||
return 0; | |||
#endif | |||
} | |||
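(A self-contained sketch of the same probe, using the bit positions defined above: bit 24 for AMX-TILE and bit 22 for AMX-BF16 in CPUID.(7,0):EDX, and bits 17-18 of CPUID.(0xD,0):EAX for the tile state components. This is only an illustration, x86-only, and it omits the AVX-512 precondition checked above:)

#include <stdio.h>

/* raw CPUID with a sub-leaf, for x86/x86_64 with GCC/Clang inline asm */
static void cpuid_count(unsigned leaf, unsigned sub,
                        unsigned *a, unsigned *b, unsigned *c, unsigned *d)
{
    __asm__ volatile("cpuid"
                     : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                     : "a"(leaf), "c"(sub));
}

int main(void)
{
    unsigned a, b, c, d;
    cpuid_count(7, 0, &a, &b, &c, &d);
    int amx = (d & 0x01000000u) && (d & 0x00400000u);   /* AMX-TILE, AMX-BF16 */
    cpuid_count(0xd, 0, &a, &b, &c, &d);
    int tiles = (a & 0x00060000u) == 0x00060000u;       /* XTILECFG/XTILEDATA */
    printf("AMX BF16 usable: %s\n", (amx && tiles) ? "yes" : "no");
    return 0;
}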
int get_vendor(void){ | |||
int eax, ebx, ecx, edx; | |||
char vendor[13]; | |||
@@ -353,6 +379,7 @@ int get_cputype(int gettype){ | |||
if (support_avx2()) feature |= HAVE_AVX2; | |||
if (support_avx512()) feature |= HAVE_AVX512VL; | |||
if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; | |||
if (support_amx_bf16()) feature |= HAVE_AMXBF16; | |||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
#endif | |||
@@ -1429,10 +1456,10 @@ int get_cpuname(void){ | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
case 9: | |||
case 8: | |||
switch (model) { | |||
case 12: // Tiger Lake | |||
case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
@@ -1448,30 +1475,70 @@ int get_cpuname(void){ | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
case 10: //family 6 exmodel 10 | |||
case 15: // Sapphire Rapids | |||
if(support_avx512_bf16()) | |||
return CPUTYPE_COOPERLAKE; | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
case 9: | |||
switch (model) { | |||
case 5: // Comet Lake H and S | |||
case 6: // Comet Lake U | |||
case 7: // Alder Lake desktop | |||
case 10: // Alder Lake mobile | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 7: // Rocket Lake | |||
if(support_avx512()) | |||
return CPUTYPE_NEHALEM; | |||
case 13: // Ice Lake NNPI | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 14: // Kaby Lake and refreshes | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
} | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
case 10: //family 6 exmodel 10 | |||
switch (model) { | |||
case 5: // Comet Lake H and S | |||
case 6: // Comet Lake U | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
case 7: // Rocket Lake | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
} | |||
break; | |||
case 0x7: | |||
return CPUTYPE_ITANIUM; | |||
case 0xf: | |||
@@ -2042,32 +2109,7 @@ int get_coretype(void){ | |||
return CORE_NEHALEM; | |||
} | |||
break; | |||
case 10: | |||
switch (model) { | |||
case 5: // Comet Lake H and S | |||
case 6: // Comet Lake U | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CORE_HASWELL; | |||
#else | |||
return CORE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CORE_NEHALEM; | |||
case 7:// Rocket Lake | |||
#ifndef NO_AVX512 | |||
if(support_avx512()) | |||
return CORE_SKYLAKEX; | |||
#endif | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
#endif | |||
if(support_avx()) | |||
return CORE_SANDYBRIDGE; | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
case 5: | |||
switch (model) { | |||
case 6: | |||
@@ -2121,6 +2163,7 @@ int get_coretype(void){ | |||
return CORE_NEHALEM; | |||
} | |||
break; | |||
case 6: | |||
if (model == 6) | |||
#ifndef NO_AVX512 | |||
@@ -2135,7 +2178,7 @@ int get_coretype(void){ | |||
else | |||
return CORE_NEHALEM; | |||
#endif | |||
if (model == 10) | |||
if (model == 10 || model == 12) | |||
#ifndef NO_AVX512 | |||
if(support_avx512_bf16()) | |||
return CORE_COOPERLAKE; | |||
@@ -2151,10 +2194,11 @@ int get_coretype(void){ | |||
return CORE_NEHALEM; | |||
#endif | |||
break; | |||
case 7: | |||
if (model == 10) | |||
return CORE_NEHALEM; | |||
if (model == 14) | |||
if (model == 13 || model == 14) // Ice Lake | |||
#ifndef NO_AVX512 | |||
return CORE_SKYLAKEX; | |||
#else | |||
@@ -2168,9 +2212,9 @@ int get_coretype(void){ | |||
return CORE_NEHALEM; | |||
#endif | |||
break; | |||
case 9: | |||
case 8: | |||
if (model == 12) { // Tiger Lake | |||
if (model == 12 || model == 13) { // Tiger Lake | |||
if(support_avx512()) | |||
return CORE_SKYLAKEX; | |||
if(support_avx2()) | |||
@@ -2180,7 +2224,7 @@ int get_coretype(void){ | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
if (model == 14) { // Kaby Lake | |||
if (model == 14) { // Kaby Lake mobile | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CORE_HASWELL; | |||
@@ -2190,12 +2234,82 @@ int get_coretype(void){ | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
} | |||
if (model == 15) { // Sapphire Rapids | |||
if(support_avx512_bf16()) | |||
return CPUTYPE_COOPERLAKE; | |||
if(support_avx512()) | |||
return CPUTYPE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CPUTYPE_HASWELL; | |||
if(support_avx()) | |||
return CPUTYPE_SANDYBRIDGE; | |||
else | |||
return CPUTYPE_NEHALEM; | |||
} | |||
break; | |||
case 9: | |||
if (model == 7 || model == 10) { // Alder Lake | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
if(support_avx()) | |||
return CORE_SANDYBRIDGE; | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
if (model == 13) { // Ice Lake NNPI | |||
if(support_avx512()) | |||
return CORE_SKYLAKEX; | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
if(support_avx()) | |||
return CORE_SANDYBRIDGE; | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
if (model == 14) { // Kaby Lake desktop | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CORE_HASWELL; | |||
#else | |||
return CORE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
break; | |||
case 10: | |||
switch (model) { | |||
case 5: // Comet Lake H and S | |||
case 6: // Comet Lake U | |||
if(support_avx()) | |||
#ifndef NO_AVX2 | |||
return CORE_HASWELL; | |||
#else | |||
return CORE_SANDYBRIDGE; | |||
#endif | |||
else | |||
return CORE_NEHALEM; | |||
case 7:// Rocket Lake | |||
#ifndef NO_AVX512 | |||
if(support_avx512()) | |||
return CORE_SKYLAKEX; | |||
#endif | |||
#ifndef NO_AVX2 | |||
if(support_avx2()) | |||
return CORE_HASWELL; | |||
#endif | |||
if(support_avx()) | |||
return CORE_SANDYBRIDGE; | |||
else | |||
return CORE_NEHALEM; | |||
} | |||
case 15: | |||
if (model <= 0x2) return CORE_NORTHWOOD; | |||
else return CORE_PRESCOTT; | |||
} | |||
} | |||
} | |||
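(The family/exmodel/model numbers that these switch statements dispatch on follow the usual CPUID leaf 1 encoding: model in EAX[7:4], family in EAX[11:8], extended model in EAX[19:16], extended family in EAX[27:20]. A small standalone illustration of that decoding, not part of the patch:)

#include <stdio.h>

int main(void)
{
    unsigned eax, ebx, ecx, edx;
    __asm__ volatile("cpuid"
                     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                     : "a"(1), "c"(0));
    unsigned family   = (eax >>  8) & 0x0f;
    unsigned exfamily = (eax >> 20) & 0xff;
    unsigned model    = (eax >>  4) & 0x0f;
    unsigned exmodel  = (eax >> 16) & 0x0f;
    /* e.g. Tiger Lake reports family 6, exmodel 8, model 12 or 13,
       matching the "case 8: ... case 12/13:" branches above */
    printf("family %u exfamily %u exmodel %u model %u\n",
           family, exfamily, exmodel, model);
    return 0;
}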
@@ -2389,6 +2503,7 @@ void get_cpuconfig(void){ | |||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); | |||
if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); | |||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
@@ -2460,9 +2575,11 @@ void get_sse(void){ | |||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); | |||
if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); | |||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||
} | |||
//} |
@@ -27,57 +27,11 @@ | |||
#include <string.h> | |||
#define CPU_GENERIC 0 | |||
#define CPU_Z13 1 | |||
#define CPU_Z14 2 | |||
#define CPU_Z15 3 | |||
#include "cpuid_zarch.h" | |||
static char *cpuname[] = { | |||
"ZARCH_GENERIC", | |||
"Z13", | |||
"Z14", | |||
"Z15" | |||
}; | |||
static char *cpuname_lower[] = { | |||
"zarch_generic", | |||
"z13", | |||
"z14", | |||
"z15" | |||
}; | |||
int detect(void) | |||
{ | |||
FILE *infile; | |||
char buffer[512], *p; | |||
p = (char *)NULL; | |||
infile = fopen("/proc/sysinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)){ | |||
if (!strncmp("Type", buffer, 4)){ | |||
p = strchr(buffer, ':') + 2; | |||
#if 0 | |||
fprintf(stderr, "%s\n", p); | |||
#endif | |||
break; | |||
} | |||
} | |||
fclose(infile); | |||
if (strstr(p, "2964")) return CPU_Z13; | |||
if (strstr(p, "2965")) return CPU_Z13; | |||
if (strstr(p, "3906")) return CPU_Z14; | |||
if (strstr(p, "3907")) return CPU_Z14; | |||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 | |||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 | |||
return CPU_GENERIC; | |||
} | |||
void get_libname(void) | |||
{ | |||
int d = detect(); | |||
printf("%s", cpuname_lower[d]); | |||
} | |||
@@ -0,0 +1,101 @@ | |||
#include <stdlib.h> | |||
#define CPU_GENERIC 0 | |||
#define CPU_Z13 1 | |||
#define CPU_Z14 2 | |||
#define CPU_Z15 3 | |||
static char *cpuname[] = { | |||
"ZARCH_GENERIC", | |||
"Z13", | |||
"Z14", | |||
"Z15" | |||
}; | |||
static char *cpuname_lower[] = { | |||
"zarch_generic", | |||
"z13", | |||
"z14", | |||
"z15" | |||
}; | |||
// Guard the use of getauxval() on glibc version >= 2.16 | |||
#ifdef __GLIBC__ | |||
#include <features.h> | |||
#if __GLIBC_PREREQ(2, 16) | |||
#include <sys/auxv.h> | |||
#define HAVE_GETAUXVAL 1 | |||
static unsigned long get_hwcap(void) | |||
{ | |||
unsigned long hwcap = getauxval(AT_HWCAP); | |||
char *maskenv; | |||
// honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
maskenv = getenv("LD_HWCAP_MASK"); | |||
if (maskenv) | |||
hwcap &= strtoul(maskenv, NULL, 0); | |||
return hwcap; | |||
// note that a missing auxval is interpreted as no capabilities | |||
// available, which is safe. | |||
} | |||
#else // __GLIBC_PREREQ(2, 16) | |||
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
static unsigned long get_hwcap(void) { | |||
// treat missing support for getauxval() as no capabilities available, | |||
// which is safe. | |||
return 0; | |||
} | |||
#endif // __GLIBC_PREREQ(2, 16) | |||
#endif // __GLIBC | |||
static int detect(void) | |||
{ | |||
unsigned long hwcap = get_hwcap(); | |||
// Choose the architecture level for optimized kernels based on hardware | |||
// capability bits (just like glibc chooses optimized implementations). | |||
// | |||
// The hardware capability bits that are used here indicate both | |||
// hardware support for a particular ISA extension and the presence of | |||
// software support to enable its use. For example, when HWCAP_S390_VX | |||
// is set then both the CPU can execute SIMD instructions and the Linux | |||
// kernel can manage applications using the vector registers and SIMD | |||
// instructions. | |||
// | |||
// See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in | |||
// sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware | |||
// capability bits. They are derived from the information that the | |||
// "store facility list (extended)" instructions provide. | |||
// (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) | |||
// | |||
// currently used: | |||
// HWCAP_S390_VX - vector facility for z/Architecture (introduced with | |||
// IBM z13), enables level CPU_Z13 (SIMD) | |||
// HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM | |||
// z14), together with VX enables level CPU_Z14 | |||
// (single-precision SIMD instructions) | |||
// | |||
// When you add optimized kernels that make use of other ISA extensions | |||
// (e.g., for exploiting the vector-enhancements facility 2 that was introduced | |||
// with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate | |||
// it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2 | |||
// for the z15 vector enhancements). | |||
// | |||
// To learn the value of hwcaps on a given system, set the environment | |||
// variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running | |||
// LD_SHOW_AUXV=1 /bin/true). | |||
// Also, the init function for dynamic arch support will print hwcaps | |||
// when OPENBLAS_VERBOSE is set to 2 or higher. | |||
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
return CPU_Z14; | |||
if (hwcap & HWCAP_S390_VX) | |||
return CPU_Z13; | |||
return CPU_GENERIC; | |||
} | |||
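(A standalone sketch of the same hwcap probe, handy for a quick check on an s390x box. It assumes glibc 2.16 or newer for getauxval() and that the HWCAP_S390_* macros are available via <sys/auxv.h>, and it honours LD_HWCAP_MASK the same way as get_hwcap() above; running it with LD_HWCAP_MASK=0 should force the generic path:)

#include <stdio.h>
#include <stdlib.h>
#include <sys/auxv.h>

int main(void)
{
    unsigned long hwcap = getauxval(AT_HWCAP);
    char *mask = getenv("LD_HWCAP_MASK");
    if (mask)                        /* allow disabling features, as above */
        hwcap &= strtoul(mask, NULL, 0);
#if defined(HWCAP_S390_VX) && defined(HWCAP_S390_VXE)
    if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
        puts("Z14 level (VX + VXE)");
    else if (hwcap & HWCAP_S390_VX)
        puts("Z13 level (VX only)");
    else
#endif
        puts("generic zarch level");
    return 0;
}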
@@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
#else | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
/* Split local region of B into parts */ | |||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ | |||
min_jj = MIN(n_to, js + div_n) - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = 0; jjs < ls - js; jjs += min_jj){ | |||
min_jj = ls - js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
min_jj = min_l - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
min_jj = min_l - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | |||
min_jj = js - ls - min_l - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
@@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
min_jj = min_j + js - jjs; | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
#else | |||
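All of the hunks above apply the same one-line change in the different GEMM/SYRK drivers: the special case that caps the width of the packed B panel for the AVX512 targets (SKYLAKEX, COOPERLAKE) is extended to SAPPHIRERAPIDS. A minimal, self-contained sketch of that blocking idea is shown below; GEMM_UNROLL_N is given a hypothetical value and split_b_panels is an illustrative name, not an OpenBLAS function.

    #include <stdio.h>

    #define GEMM_UNROLL_N 8                  /* hypothetical unroll factor, for the sketch only */
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Walk a local region of B in panels of at most 6*GEMM_UNROLL_N columns,
     * mirroring the clamp the patch applies for the AVX512 targets. */
    static void split_b_panels(long js, long n_to, long div_n)
    {
        long end = MIN(n_to, js + div_n);
        for (long jjs = js; jjs < end; ) {
            long min_jj = end - jjs;
            if (min_jj >= 6 * GEMM_UNROLL_N)
                min_jj = 6 * GEMM_UNROLL_N;  /* cap the packed panel width */
            printf("pack and compute columns %ld..%ld\n", jjs, jjs + min_jj - 1);
            jjs += min_jj;
        }
    }

    int main(void) { split_b_panels(0, 100, 80); return 0; }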
@@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" | |||
if (DYNAMIC_ARCH) | |||
if (ARM64) | |||
list(APPEND COMMON_SOURCES dynamic_arm64.c) | |||
elseif (POWER) | |||
list(APPEND COMMON_SOURCES dynamic_power.c) | |||
else () | |||
list(APPEND COMMON_SOURCES dynamic.c) | |||
endif () | |||
@@ -40,7 +40,7 @@ | |||
#include <stdlib.h> | |||
#include "common.h" | |||
#if defined(OS_CYGWIN_NT) && !defined(unlikely) | |||
#if !defined(unlikely) | |||
#ifdef __GNUC__ | |||
#define unlikely(x) __builtin_expect(!!(x), 0) | |||
#else | |||
@@ -391,8 +391,9 @@ int blas_thread_init(void){ | |||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
#if defined(SMP_SERVER) | |||
// Handle lazy re-initialization of the thread pool after a POSIX fork | |||
// on Cygwin, or delayed initialization when OpenBLAS is used as a static library | |||
if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
#endif | |||
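This hunk widens the lazy thread-pool initialization from a Cygwin-only workaround to every SMP_SERVER build, and the unlikely() helper is now defined whenever it is missing rather than only on Cygwin. Below is a standalone sketch of the same branch-hint pattern; pool_ready and pool_init are placeholders, not the library's internal symbols.

    #include <stdio.h>

    #ifndef unlikely
    # ifdef __GNUC__
    #  define unlikely(x) __builtin_expect(!!(x), 0)
    # else
    #  define unlikely(x) (x)
    # endif
    #endif

    static int pool_ready = 0;                       /* placeholder for blas_server_avail  */
    static void pool_init(void) { pool_ready = 1; }  /* placeholder for blas_thread_init() */

    /* Re-create the worker pool on first use, e.g. after fork() or when the
     * static library was never initialized up front. */
    static void submit_work(void)
    {
        if (unlikely(pool_ready == 0)) pool_init();
        puts("work queued");
    }

    int main(void) { submit_work(); return 0; }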
@@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){ | |||
return &gotoblas_NEHALEM; | |||
} | |||
} | |||
if (model == 10) { | |||
if (model == 10 || model == 12){ | |||
// Ice Lake SP | |||
if(support_avx512_bf16()) | |||
return &gotoblas_COOPERLAKE; | |||
@@ -639,12 +639,12 @@ static gotoblas_t *get_coretype(void){ | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; | |||
} | |||
} | |||
} | |||
return NULL; | |||
case 7: | |||
if (model == 10) // Goldmont Plus | |||
return &gotoblas_NEHALEM; | |||
if (model == 14) { | |||
if (model == 13 || model == 14) { | |||
// Ice Lake | |||
if (support_avx512()) | |||
return &gotoblas_SKYLAKEX; | |||
@@ -661,9 +661,8 @@ static gotoblas_t *get_coretype(void){ | |||
} | |||
} | |||
return NULL; | |||
case 9: | |||
case 8: | |||
if (model == 12) { // Tiger Lake | |||
if (model == 12 || model == 13) { // Tiger Lake | |||
if (support_avx512()) | |||
return &gotoblas_SKYLAKEX; | |||
if(support_avx2()){ | |||
@@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){ | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
if (model == 15){ // Sapphire Rapids | |||
if(support_avx512_bf16()) | |||
return &gotoblas_COOPERLAKE; | |||
if (support_avx512()) | |||
return &gotoblas_SKYLAKEX; | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; | |||
} | |||
} | |||
return NULL; | |||
case 9: | |||
if (model == 7 || model == 10) { // Alder Lake | |||
if(support_avx2()){ | |||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
return &gotoblas_HASWELL; | |||
} | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; | |||
} | |||
} | |||
if (model == 14 ) { // Kaby Lake, Coffee Lake | |||
if(support_avx2()) | |||
return &gotoblas_HASWELL; | |||
if(support_avx()) { | |||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
return &gotoblas_SANDYBRIDGE; | |||
} else { | |||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
} | |||
} | |||
return NULL; | |||
case 10: | |||
if (model == 5 || model == 6) { | |||
if(support_avx2()) | |||
@@ -1018,7 +1061,13 @@ void gotoblas_dynamic_init(void) { | |||
#ifdef ARCH_X86 | |||
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; | |||
#else | |||
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; | |||
if (gotoblas == NULL) { | |||
if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; | |||
else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; | |||
else if (support_avx2()) gotoblas = &gotoblas_HASWELL; | |||
else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; | |||
else gotoblas = &gotoblas_PRESCOTT; | |||
} | |||
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */ | |||
if (sizeof(void*) == 8) { | |||
if (gotoblas == &gotoblas_KATMAI || | |||
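Besides teaching get_coretype() about the newer Intel family/model combinations (Ice Lake-SP and client, Tiger Lake, Alder Lake, Sapphire Rapids), the DYNAMIC_ARCH fallback for an unrecognized 64-bit CPU is no longer a bare Prescott: gotoblas_dynamic_init() now walks down the feature ladder. A compact sketch of that selection order follows; the has_* probes stand in for the library's support_avx512_bf16()/support_avx512()/support_avx2()/support_avx() checks and simply return fixed values here.

    #include <stdio.h>

    /* Hypothetical capability probes standing in for support_*() in dynamic.c. */
    static int has_avx512_bf16(void) { return 0; }
    static int has_avx512(void)      { return 1; }
    static int has_avx2(void)        { return 1; }
    static int has_avx(void)         { return 1; }

    /* Pick the most capable kernel set the running CPU can execute,
     * mirroring the new last-resort ladder in gotoblas_dynamic_init(). */
    static const char *pick_kernels(void)
    {
        if (has_avx512_bf16()) return "COOPERLAKE";
        if (has_avx512())      return "SKYLAKEX";
        if (has_avx2())        return "HASWELL";
        if (has_avx())         return "SANDYBRIDGE";
        return "PRESCOTT";     /* lowest common denominator on x86_64 */
    }

    int main(void) { printf("selected: %s\n", pick_kernels()); return 0; }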
@@ -1,38 +1,7 @@ | |||
#include "common.h" | |||
#include "cpuid_zarch.h" | |||
#include <stdbool.h> | |||
// Guard the use of getauxval() on glibc version >= 2.16 | |||
#ifdef __GLIBC__ | |||
#include <features.h> | |||
#if __GLIBC_PREREQ(2, 16) | |||
#include <sys/auxv.h> | |||
#define HAVE_GETAUXVAL 1 | |||
static unsigned long get_hwcap(void) | |||
{ | |||
unsigned long hwcap = getauxval(AT_HWCAP); | |||
char *maskenv; | |||
// honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
maskenv = getenv("LD_HWCAP_MASK"); | |||
if (maskenv) | |||
hwcap &= strtoul(maskenv, NULL, 0); | |||
return hwcap; | |||
// note that a missing auxval is interpreted as no capabilities | |||
// available, which is safe. | |||
} | |||
#else // __GLIBC_PREREQ(2, 16) | |||
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
static unsigned long get_hwcap(void) { | |||
// treat missing support for getauxval() as no capabilities available, | |||
// which is safe. | |||
return 0; | |||
} | |||
#endif // __GLIBC_PREREQ(2, 16) | |||
#endif // __GLIBC | |||
extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||
#ifdef DYN_Z13 | |||
@@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14; | |||
#define NUM_CORETYPES 4 | |||
extern int openblas_verbose(); | |||
extern void openblas_warning(int verbose, const char* msg); | |||
static char* corename[] = { | |||
"unknown", | |||
"Z13", | |||
"Z14", | |||
"ZARCH_GENERIC", | |||
}; | |||
char* gotoblas_corename(void) { | |||
#ifdef DYN_Z13 | |||
if (gotoblas == &gotoblas_Z13) return corename[1]; | |||
if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; | |||
#endif | |||
#ifdef DYN_Z14 | |||
if (gotoblas == &gotoblas_Z14) return corename[2]; | |||
if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; | |||
#endif | |||
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||
if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; | |||
return corename[0]; | |||
return "unknown"; | |||
} | |||
#ifndef HWCAP_S390_VXE | |||
@@ -79,25 +42,28 @@ char* gotoblas_corename(void) { | |||
*/ | |||
static gotoblas_t* get_coretype(void) { | |||
unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||
int cpu = detect(); | |||
#ifdef DYN_Z14 | |||
switch(cpu) { | |||
// z14 and z15 systems: exploit Vector Facility (SIMD) and | |||
// Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
case CPU_Z14: | |||
#ifdef DYN_Z14 | |||
return &gotoblas_Z14; | |||
#endif | |||
#ifdef DYN_Z13 | |||
// z13: Vector Facility (SIMD for double) | |||
if (hwcap & HWCAP_S390_VX) | |||
case CPU_Z13: | |||
#ifdef DYN_Z13 | |||
return &gotoblas_Z13; | |||
#endif | |||
default: | |||
// fallback in case of missing compiler support, systems before z13, or | |||
// when the OS does not advertise support for the Vector Facility (e.g., | |||
// missing support in the OS kernel) | |||
return &gotoblas_ZARCH_GENERIC; | |||
return &gotoblas_ZARCH_GENERIC; | |||
} | |||
} | |||
static gotoblas_t* force_coretype(char* coretype) { | |||
@@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) { | |||
for (i = 0; i < NUM_CORETYPES; i++) | |||
{ | |||
if (!strncasecmp(coretype, corename[i], 20)) | |||
if (!strncasecmp(coretype, cpuname[i], 20)) | |||
{ | |||
found = i; | |||
break; | |||
} | |||
} | |||
if (found == 1) { | |||
if (found == CPU_Z13) { | |||
#ifdef DYN_Z13 | |||
return &gotoblas_Z13; | |||
#else | |||
openblas_warning(1, "Z13 support not compiled in"); | |||
return NULL; | |||
#endif | |||
} else if (found == 2) { | |||
} else if (found == CPU_Z14) { | |||
#ifdef DYN_Z14 | |||
return &gotoblas_Z14; | |||
#else | |||
openblas_warning(1, "Z14 support not compiled in"); | |||
return NULL; | |||
#endif | |||
} else if (found == 3) { | |||
} else if (found == CPU_GENERIC) { | |||
return &gotoblas_ZARCH_GENERIC; | |||
} | |||
@@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) { | |||
else | |||
{ | |||
gotoblas = get_coretype(); | |||
if (openblas_verbose() >= 2) { | |||
snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", | |||
getauxval(AT_HWCAP)); | |||
openblas_warning(2, coremsg); | |||
} | |||
} | |||
if (gotoblas == NULL) | |||
@@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) { | |||
} | |||
if (gotoblas && gotoblas->init) { | |||
strncpy(coren, gotoblas_corename(), 20); | |||
sprintf(coremsg, "Core: %s\n", coren); | |||
openblas_warning(2, coremsg); | |||
if (openblas_verbose() >= 2) { | |||
strncpy(coren, gotoblas_corename(), 20); | |||
sprintf(coremsg, "Core: %s\n", coren); | |||
openblas_warning(2, coremsg); | |||
} | |||
gotoblas->init(); | |||
} | |||
else { | |||
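The zarch dispatcher drops its private HWCAP parsing and corename[] table in favour of the shared detect()/cpuname[] machinery from cpuid_zarch.h, and the capability ladder becomes a switch with deliberate fall-through, so a Z14 machine still lands on the next kernel set that was actually compiled in. A self-contained sketch of that fall-through pattern is given below; the CPU ids and the assumption that only DYN_Z13 is enabled are illustrative.

    #include <stdio.h>

    enum { CPU_GENERIC = 0, CPU_Z13 = 1, CPU_Z14 = 2 };  /* sketch of the cpuid_zarch ids */

    #define DYN_Z13 1     /* pretend only the Z13 kernels were compiled in */

    static const char *select_zarch(int cpu)
    {
        switch (cpu) {
        case CPU_Z14:
    #ifdef DYN_Z14
            return "Z14";           /* best match, if compiled in */
    #endif
            /* fall through: Z14 hardware can always run the Z13 kernels */
        case CPU_Z13:
    #ifdef DYN_Z13
            return "Z13";
    #endif
            /* fall through */
        default:
            return "ZARCH_GENERIC"; /* safe for pre-z13 or unknown machines */
        }
    }

    int main(void) { printf("selected: %s\n", select_zarch(CPU_Z14)); return 0; }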
@@ -246,6 +246,14 @@ int get_num_procs(void) { | |||
#endif | |||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
#if defined(USE_OPENMP) | |||
#if _OPENMP >= 201511 | |||
nums = omp_get_num_places(); | |||
#endif | |||
return nums; | |||
#endif | |||
#if !defined(OS_LINUX) | |||
return nums; | |||
#endif | |||
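This change (repeated once per allocator variant in memory.c) makes get_num_procs() prefer omp_get_num_places() in OpenMP builds. The routine exists since OpenMP 4.5, hence the _OPENMP >= 201511 guard, and using it keeps a process that was restricted via OMP_PLACES from sizing its pool by the full machine. A minimal sketch of the same guard follows; the zero-places safety check is an addition of the sketch, not part of the patch.

    #include <stdio.h>
    #include <unistd.h>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    /* Return the number of processors this process should actually use. */
    static int usable_procs(void)
    {
        int nums = (int)sysconf(_SC_NPROCESSORS_CONF);
    #if defined(_OPENMP) && _OPENMP >= 201511
        int places = omp_get_num_places();  /* respects OMP_PLACES / affinity */
        if (places > 0) nums = places;
    #endif
        return nums;
    }

    int main(void) { printf("procs: %d\n", usable_procs()); return 0; }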
@@ -1806,10 +1814,19 @@ int get_num_procs(void) { | |||
#endif | |||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
#if defined(USE_OPENMP) | |||
/* if (omp_get_proc_bind() != omp_proc_bind_false) */ | |||
#if _OPENMP >= 201511 | |||
nums = omp_get_num_places(); | |||
#endif | |||
return nums; | |||
#endif | |||
#if !defined(OS_LINUX) | |||
return nums; | |||
#endif | |||
#if !defined(__GLIBC_PREREQ) | |||
return nums; | |||
#else | |||
@@ -2854,32 +2871,28 @@ void *blas_memory_alloc(int procpos){ | |||
position ++; | |||
} while (position < NUM_BUFFERS); | |||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
if (memory_overflowed) { | |||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
do { | |||
RMB; | |||
do { | |||
RMB; | |||
#if defined(USE_OPENMP) | |||
if (!newmemory[position-NUM_BUFFERS].used) { | |||
blas_lock(&newmemory[position-NUM_BUFFERS].lock); | |||
if (!newmemory[position-NUM_BUFFERS].used) { | |||
blas_lock(&newmemory[position-NUM_BUFFERS].lock); | |||
#endif | |||
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; | |||
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; | |||
#if defined(USE_OPENMP) | |||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||
} | |||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||
} | |||
#endif | |||
position ++; | |||
position ++; | |||
} while (position < 512+NUM_BUFFERS); | |||
} while (position < 512+NUM_BUFFERS); | |||
} | |||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
} | |||
goto error; | |||
allocation : | |||
@@ -2904,7 +2917,7 @@ void *blas_memory_alloc(int procpos){ | |||
func = &memoryalloc[0]; | |||
while ((func != NULL) && (map_address == (void *) -1)) { | |||
while ((*func != NULL) && (map_address == (void *) -1)) { | |||
map_address = (*func)((void *)base_address); | |||
@@ -2984,6 +2997,9 @@ void *blas_memory_alloc(int procpos){ | |||
return (void *)memory[position].addr; | |||
error: | |||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
LOCK_COMMAND(&alloc_lock); | |||
#endif | |||
if (memory_overflowed) goto terminate; | |||
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); | |||
memory_overflowed=1; | |||
@@ -2997,7 +3013,6 @@ void *blas_memory_alloc(int procpos){ | |||
newmemory[i].used = 0; | |||
newmemory[i].lock = 0; | |||
} | |||
newmemory[position-NUM_BUFFERS].used = 1; | |||
allocation2: | |||
newmemory[position-NUM_BUFFERS].used = 1; | |||
@@ -3015,7 +3030,7 @@ allocation2: | |||
func = &memoryalloc[0]; | |||
while ((func != NULL) && (map_address == (void *) -1)) { | |||
while ((*func != NULL) && (map_address == (void *) -1)) { | |||
map_address = (*func)((void *)base_address); | |||
@@ -3069,6 +3084,9 @@ allocation2: | |||
return (void *)newmemory[position-NUM_BUFFERS].addr; | |||
terminate: | |||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
UNLOCK_COMMAND(&alloc_lock); | |||
#endif | |||
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); | |||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); | |||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); | |||
@@ -183,7 +183,7 @@ int get_L2_size(void){ | |||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ | |||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) | |||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
@@ -269,7 +269,7 @@ void blas_set_parameter(void){ | |||
int factor; | |||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ | |||
defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ | |||
defined(SKYLAKEX) || defined(COOPERLAKE) | |||
defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
int size = 16; | |||
#else | |||
int size = get_L2_size(); | |||
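SAPPHIRERAPIDS joins the cores that read the L2 size from CPUID leaf 0x80000006 and that use the fixed GEMM parameter size of 16 in blas_set_parameter(). For reference, ECX bits 31..16 of that leaf report the L2 cache size in KB; a minimal sketch using the GCC/Clang cpuid helper (x86 only) is:

    #include <stdio.h>
    #include <cpuid.h>   /* __get_cpuid(), GCC/Clang only */

    /* Query the L2 cache size in KB from extended leaf 0x80000006. */
    static unsigned l2_size_kb(void)
    {
        unsigned eax, ebx, ecx, edx;
        if (!__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx))
            return 0;                 /* leaf not supported */
        return ecx >> 16;             /* bits 31..16: L2 size in KB */
    }

    int main(void) { printf("L2: %u KB\n", l2_size_kb()); return 0; }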
@@ -469,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
#endif | |||
#ifdef FORCE_SAPPHIRERAPIDS | |||
#define FORCE | |||
#define FORCE_INTEL | |||
#define ARCHITECTURE "X86" | |||
#ifdef NO_AVX512 | |||
#ifdef NO_AVX2 | |||
#ifdef NO_AVX | |||
#define SUBARCHITECTURE "NEHALEM" | |||
#define ARCHCONFIG "-DNEHALEM " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
#define LIBNAME "nehalem" | |||
#define CORENAME "NEHALEM" | |||
#else | |||
#define SUBARCHITECTURE "SANDYBRIDGE" | |||
#define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||
#define LIBNAME "sandybridge" | |||
#define CORENAME "SANDYBRIDGE" | |||
#endif | |||
#else | |||
#define SUBARCHITECTURE "HASWELL" | |||
#define ARCHCONFIG "-DHASWELL " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
#define LIBNAME "haswell" | |||
#define CORENAME "HASWELL" | |||
#endif | |||
#else | |||
#define SUBARCHITECTURE "SAPPHIRERAPIDS" | |||
#define ARCHCONFIG "-DSAPPHIRERAPIDS " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids" | |||
#define LIBNAME "sapphirerapids" | |||
#define CORENAME "SAPPHIRERAPIDS" | |||
#endif | |||
#endif | |||
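The FORCE_SAPPHIRERAPIDS block in getarch.c follows the same pattern as the other AVX-512 targets: if the toolchain cannot emit AVX-512 (NO_AVX512), the build silently degrades to HASWELL, then SANDYBRIDGE, then NEHALEM. A trimmed-down sketch of that preprocessor ladder:

    #include <stdio.h>

    /* Compile-time selection of the best subtarget the toolchain can build,
     * mirroring the nested NO_AVX512 / NO_AVX2 / NO_AVX guards above. */
    #if defined(NO_AVX512)
    # if defined(NO_AVX2)
    #  if defined(NO_AVX)
    #   define CORENAME "NEHALEM"
    #  else
    #   define CORENAME "SANDYBRIDGE"
    #  endif
    # else
    #  define CORENAME "HASWELL"
    # endif
    #else
    # define CORENAME "SAPPHIRERAPIDS"
    #endif

    int main(void) { printf("building for %s\n", CORENAME); return 0; }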
#ifdef FORCE_ATOM | |||
#define FORCE | |||
#define FORCE_INTEL | |||
@@ -964,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define ARCHCONFIG "-DP5600 " \ | |||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ | |||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" | |||
#define LIBNAME "p5600" | |||
#define CORENAME "P5600" | |||
#else | |||
@@ -978,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define ARCHCONFIG "-DMIPS1004K " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" | |||
#define LIBNAME "mips1004K" | |||
#define CORENAME "MIPS1004K" | |||
#else | |||
@@ -992,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define ARCHCONFIG "-DMIPS24K " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
"-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" | |||
#define LIBNAME "mips24K" | |||
#define CORENAME "MIPS24K" | |||
#else | |||
@@ -1149,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#else | |||
#endif | |||
#ifdef FORCE_ARMV8SVE | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "ARMV8SVE" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DARMV8SVE " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
#define LIBNAME "armv8sve" | |||
#define CORENAME "ARMV8SVE" | |||
#endif | |||
#ifdef FORCE_ARMV8 | |||
#define FORCE | |||
@@ -1375,6 +1438,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define CORENAME "VORTEX" | |||
#endif | |||
#ifdef FORCE_A64FX | |||
#define ARMV8 | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "A64FX" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DA64FX " \ | |||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
"-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ | |||
"-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" | |||
#define LIBNAME "a64fx" | |||
#define CORENAME "A64FX" | |||
#else | |||
#endif | |||
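The new ARMV8SVE and A64FX targets both add -DHAVE_SVE on top of the usual NEON/VFP defines; A64FX additionally encodes its unusually large 256-byte cache lines and 8 MB L2. On AArch64 Linux, SVE availability can be confirmed at run time from the auxiliary vector; a minimal sketch (HWCAP_SVE is the kernel's AArch64 hwcap bit, redefined here only in case the libc headers lack it):

    #include <stdio.h>
    #include <sys/auxv.h>        /* getauxval(), Linux-specific */

    #ifndef HWCAP_SVE
    #define HWCAP_SVE (1 << 22)  /* AArch64 hwcap bit for the Scalable Vector Extension */
    #endif

    int main(void)
    {
        unsigned long hwcap = getauxval(AT_HWCAP);
        printf("SVE %s\n", (hwcap & HWCAP_SVE) ? "available" : "not available");
        return 0;
    }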
#ifdef FORCE_ZARCH_GENERIC | |||
#define FORCE | |||
#define ARCHITECTURE "ZARCH" | |||
@@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
if (n == 0) return; | |||
if (incx == 1 && trans == 0 && n < 50) { | |||
buffer = NULL; | |||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); | |||
return; | |||
} | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
@@ -42,14 +42,20 @@ | |||
#include "functable.h" | |||
#endif | |||
#ifndef CBLAS | |||
void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ | |||
BLASLONG n = *N; | |||
BLASLONG incx = *INCX; | |||
BLASLONG incy = *INCY; | |||
FLOAT c = *C; | |||
FLOAT s = *S; | |||
#else | |||
void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) { | |||
FLOAT *x = (FLOAT*) VX; | |||
FLOAT *y = (FLOAT*) VY; | |||
#endif /* CBLAS */ | |||
PRINT_DEBUG_NAME; | |||
if (n <= 0) return; | |||
@@ -4,8 +4,16 @@ | |||
#include "functable.h" | |||
#endif | |||
#ifndef CBLAS | |||
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ | |||
#else | |||
void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { | |||
FLOAT *DA = (FLOAT*) VDA; | |||
FLOAT *DB = (FLOAT*) VDB; | |||
FLOAT *S = (FLOAT*) VS; | |||
#endif /* CBLAS */ | |||
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) | |||
long double da_r = *(DA + 0); | |||
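These hunks add CBLAS entry points for the rotation routines; the complex variants take void* vector and parameter arguments and cast them back to FLOAT internally. The calling convention mirrors the long-established real-valued CBLAS routines, so a short usage sketch against the standard cblas_srotg/cblas_srot prototypes (not the new complex entry points) looks like this:

    #include <stdio.h>
    #include <cblas.h>

    int main(void)
    {
        float a = 3.0f, b = 4.0f, c, s;
        cblas_srotg(&a, &b, &c, &s);       /* build the Givens rotation for (3,4)    */

        float x[2] = {3.0f, 1.0f}, y[2] = {4.0f, 2.0f};
        cblas_srot(2, x, 1, y, 1, c, s);   /* apply the rotation to the vector pair  */

        printf("c=%g s=%g  x=(%g,%g) y=(%g,%g)\n", c, s, x[0], x[1], y[0], y[1]);
        return 0;
    }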
@@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
if (n == 0) return; | |||
if (incx == 1 && trans == 0 && n < 50) { | |||
buffer = NULL; | |||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); | |||
return; | |||
} | |||
IDEBUG_START; | |||
FUNCTION_PROFILE_START(); | |||
@@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
if (${DYNAMIC_ARCH}) | |||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
endif () | |||
ParseMakefileVars("${KERNELDIR}/KERNEL") | |||
ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") | |||
SetDefaultL1() | |||
SetDefaultL2() | |||
SetDefaultL3() | |||
ParseMakefileVars("${KERNELDIR}/KERNEL") | |||
ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") | |||
set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h) | |||
if(NOT NO_LAPACK) | |||
@@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
# Makefile.L3 | |||
set(USE_TRMM false) | |||
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) | |||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) | |||
set(USE_TRMM true) | |||
endif () | |||
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||
@@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) | |||
# symm for s and d | |||
if (NOT DEFINED ${float_char}SYMMUCOPY_M) | |||
set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c") | |||
set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
else () | |||
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") | |||
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") | |||
endif() | |||
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
# These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. | |||
# Could simplify it a bit by pairing up by -UUNIT/-DUNIT. | |||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
if (NOT DEFINED ${float_char}TRMMUNCOPY_M) | |||
set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
else () | |||
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") | |||
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") | |||
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") | |||
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") | |||
endif () | |||
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | |||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | |||
@@ -578,11 +596,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||
endif () | |||
if (BUILD_BFLOAT16) | |||
if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) | |||
@@ -616,11 +634,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") | |||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") | |||
endif () | |||
endif () | |||
@@ -31,7 +31,22 @@ ifdef NO_AVX2 | |||
endif | |||
ifdef TARGET_CORE | |||
ifeq ($(TARGET_CORE), COOPERLAKE) | |||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
ifeq ($(GCCVERSIONGTEQ10), 1) | |||
override CFLAGS += -march=sapphirerapids | |||
else | |||
override CFLAGS += -march=skylake-avx512 -mavx512f | |||
endif | |||
ifeq ($(OSNAME), CYGWIN_NT) | |||
override CFLAGS += -fno-asynchronous-unwind-tables | |||
endif | |||
ifeq ($(OSNAME), WINNT) | |||
ifeq ($(C_COMPILER), GCC) | |||
override CFLAGS += -fno-asynchronous-unwind-tables | |||
endif | |||
endif | |||
else ifeq ($(TARGET_CORE), COOPERLAKE) | |||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
ifeq ($(GCCVERSIONGTEQ10), 1) | |||
override CFLAGS += -march=cooperlake | |||
@@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE) | |||
USE_TRMM = 1 | |||
endif | |||
ifeq ($(CORE), SAPPHIRERAPIDS) | |||
USE_TRMM = 1 | |||
endif | |||
ifeq ($(CORE), ZEN) | |||
USE_TRMM = 1 | |||
endif | |||
@@ -1479,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT | |||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ | |||
ifdef STRMMUNCOPY_M | |||
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
endif | |||
ifdef STRMMLNCOPY_M | |||
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
endif | |||
ifdef STRMMUTCOPY_M | |||
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
endif | |||
ifdef STRMMLTCOPY_M | |||
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
endif | |||
$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
@@ -1527,29 +1563,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N | |||
$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
ifdef DTRMMUNCOPY_M | |||
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
endif | |||
ifdef DTRMMLNCOPY_M | |||
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
endif | |||
ifdef DTRMMUTCOPY_M | |||
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
endif | |||
ifdef DTRMMLTCOPY_M | |||
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
else | |||
$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
endif | |||
$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
@@ -1773,11 +1841,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). | |||
$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
ifdef SSYMMUCOPY_M | |||
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
else | |||
$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
endif | |||
ifdef SSYMMLCOPY_M | |||
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
else | |||
$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
endif | |||
$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
@@ -1785,11 +1863,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). | |||
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
ifdef DSYMMUCOPY_M | |||
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
else | |||
$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
endif | |||
ifdef DSYMMLCOPY_M | |||
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
else | |||
$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
endif | |||
$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c | |||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
@@ -0,0 +1,183 @@ | |||
SAMINKERNEL = ../arm/amin.c | |||
DAMINKERNEL = ../arm/amin.c | |||
CAMINKERNEL = ../arm/zamin.c | |||
ZAMINKERNEL = ../arm/zamin.c | |||
SMAXKERNEL = ../arm/max.c | |||
DMAXKERNEL = ../arm/max.c | |||
SMINKERNEL = ../arm/min.c | |||
DMINKERNEL = ../arm/min.c | |||
ISAMINKERNEL = ../arm/iamin.c | |||
IDAMINKERNEL = ../arm/iamin.c | |||
ICAMINKERNEL = ../arm/izamin.c | |||
IZAMINKERNEL = ../arm/izamin.c | |||
ISMAXKERNEL = ../arm/imax.c | |||
IDMAXKERNEL = ../arm/imax.c | |||
ISMINKERNEL = ../arm/imin.c | |||
IDMINKERNEL = ../arm/imin.c | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
SAMAXKERNEL = amax.S | |||
DAMAXKERNEL = amax.S | |||
CAMAXKERNEL = zamax.S | |||
ZAMAXKERNEL = zamax.S | |||
SAXPYKERNEL = axpy.S | |||
DAXPYKERNEL = axpy.S | |||
CAXPYKERNEL = zaxpy.S | |||
ZAXPYKERNEL = zaxpy.S | |||
SROTKERNEL = rot.S | |||
DROTKERNEL = rot.S | |||
CROTKERNEL = zrot.S | |||
ZROTKERNEL = zrot.S | |||
SSCALKERNEL = scal.S | |||
DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
SGEMVNKERNEL = gemv_n.S | |||
DGEMVNKERNEL = gemv_n.S | |||
CGEMVNKERNEL = zgemv_n.S | |||
ZGEMVNKERNEL = zgemv_n.S | |||
SGEMVTKERNEL = gemv_t.S | |||
DGEMVTKERNEL = gemv_t.S | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SASUMKERNEL = asum.S | |||
DASUMKERNEL = asum.S | |||
CASUMKERNEL = casum.S | |||
ZASUMKERNEL = zasum.S | |||
SCOPYKERNEL = copy.S | |||
DCOPYKERNEL = copy.S | |||
CCOPYKERNEL = copy.S | |||
ZCOPYKERNEL = copy.S | |||
SSWAPKERNEL = swap.S | |||
DSWAPKERNEL = swap.S | |||
CSWAPKERNEL = swap.S | |||
ZSWAPKERNEL = swap.S | |||
ISAMAXKERNEL = iamax.S | |||
IDAMAXKERNEL = iamax.S | |||
ICAMAXKERNEL = izamax.S | |||
IZAMAXKERNEL = izamax.S | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.S | |||
ifneq ($(C_COMPILER), PGI) | |||
SDOTKERNEL = ../generic/dot.c | |||
else | |||
SDOTKERNEL = dot.S | |||
endif | |||
ifneq ($(C_COMPILER), PGI) | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
else | |||
CDOTKERNEL = ../arm/zdot.c | |||
ZDOTKERNEL = ../arm/zdot.c | |||
endif | |||
DSDOTKERNEL = dot.S | |||
DGEMM_BETA = dgemm_beta.S | |||
SGEMM_BETA = sgemm_beta.S | |||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
SSYMMUCOPY_M = symm_ucopy_sve.c | |||
SSYMMLCOPY_M = symm_lcopy_sve.c | |||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
DSYMMUCOPY_M = symm_ucopy_sve.c | |||
DSYMMLCOPY_M = symm_lcopy_sve.c | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) |
@@ -0,0 +1,183 @@ | |||
SAMINKERNEL = ../arm/amin.c | |||
DAMINKERNEL = ../arm/amin.c | |||
CAMINKERNEL = ../arm/zamin.c | |||
ZAMINKERNEL = ../arm/zamin.c | |||
SMAXKERNEL = ../arm/max.c | |||
DMAXKERNEL = ../arm/max.c | |||
SMINKERNEL = ../arm/min.c | |||
DMINKERNEL = ../arm/min.c | |||
ISAMINKERNEL = ../arm/iamin.c | |||
IDAMINKERNEL = ../arm/iamin.c | |||
ICAMINKERNEL = ../arm/izamin.c | |||
IZAMINKERNEL = ../arm/izamin.c | |||
ISMAXKERNEL = ../arm/imax.c | |||
IDMAXKERNEL = ../arm/imax.c | |||
ISMINKERNEL = ../arm/imin.c | |||
IDMINKERNEL = ../arm/imin.c | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
SAMAXKERNEL = amax.S | |||
DAMAXKERNEL = amax.S | |||
CAMAXKERNEL = zamax.S | |||
ZAMAXKERNEL = zamax.S | |||
SAXPYKERNEL = axpy.S | |||
DAXPYKERNEL = axpy.S | |||
CAXPYKERNEL = zaxpy.S | |||
ZAXPYKERNEL = zaxpy.S | |||
SROTKERNEL = rot.S | |||
DROTKERNEL = rot.S | |||
CROTKERNEL = zrot.S | |||
ZROTKERNEL = zrot.S | |||
SSCALKERNEL = scal.S | |||
DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
SGEMVNKERNEL = gemv_n.S | |||
DGEMVNKERNEL = gemv_n.S | |||
CGEMVNKERNEL = zgemv_n.S | |||
ZGEMVNKERNEL = zgemv_n.S | |||
SGEMVTKERNEL = gemv_t.S | |||
DGEMVTKERNEL = gemv_t.S | |||
CGEMVTKERNEL = zgemv_t.S | |||
ZGEMVTKERNEL = zgemv_t.S | |||
SASUMKERNEL = asum.S | |||
DASUMKERNEL = asum.S | |||
CASUMKERNEL = casum.S | |||
ZASUMKERNEL = zasum.S | |||
SCOPYKERNEL = copy.S | |||
DCOPYKERNEL = copy.S | |||
CCOPYKERNEL = copy.S | |||
ZCOPYKERNEL = copy.S | |||
SSWAPKERNEL = swap.S | |||
DSWAPKERNEL = swap.S | |||
CSWAPKERNEL = swap.S | |||
ZSWAPKERNEL = swap.S | |||
ISAMAXKERNEL = iamax.S | |||
IDAMAXKERNEL = iamax.S | |||
ICAMAXKERNEL = izamax.S | |||
IZAMAXKERNEL = izamax.S | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.S | |||
ifneq ($(C_COMPILER), PGI) | |||
SDOTKERNEL = ../generic/dot.c | |||
else | |||
SDOTKERNEL = dot.S | |||
endif | |||
ifneq ($(C_COMPILER), PGI) | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
else | |||
CDOTKERNEL = ../arm/zdot.c | |||
ZDOTKERNEL = ../arm/zdot.c | |||
endif | |||
DSDOTKERNEL = dot.S | |||
DGEMM_BETA = dgemm_beta.S | |||
SGEMM_BETA = sgemm_beta.S | |||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S | |||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S | |||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c | |||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c | |||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
SSYMMUCOPY_M = symm_ucopy_sve.c | |||
SSYMMLCOPY_M = symm_lcopy_sve.c | |||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S | |||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
DSYMMUCOPY_M = symm_ucopy_sve.c | |||
DSYMMLCOPY_M = symm_lcopy_sve.c | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
endif | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) |
@@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c | |||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
@@ -169,7 +169,7 @@ endif | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c | |||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
@@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c | |||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
@@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c | |||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
@@ -169,7 +169,7 @@ endif | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c | |||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
@@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c | |||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
@@ -1 +1 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8 | |||
include $(KERNELDIR)/KERNEL.NEOVERSEN1 |
@@ -0,0 +1,898 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
#define FMLA_RI "fmla " | |||
#define FMLA_IR "fmla " | |||
#define FMLA_II "fmls " | |||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
#define FMLA_RI "fmls " | |||
#define FMLA_IR "fmla " | |||
#define FMLA_II "fmla " | |||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
#define FMLA_RI "fmla " | |||
#define FMLA_IR "fmls " | |||
#define FMLA_II "fmla " | |||
#else | |||
#define FMLA_RI "fmls " | |||
#define FMLA_IR "fmls " | |||
#define FMLA_II "fmls " | |||
#endif | |||
#define FMLA_RR "fmla " | |||
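/* The FMLA_* macros above select "fmla" (multiply-accumulate) or "fmls"
 * (multiply-subtract) for the four partial products of a complex multiply,
 *   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br),
 * where RR/II/RI/IR name the real*real, imag*imag, real*imag and imag*real
 * terms; RR and II feed the real accumulators, RI and IR the imaginary ones.
 * The other build-time define combinations correspond to conjugating A and/or
 * B, which only flips the signs of the terms involving the conjugated
 * imaginary part, hence the different fmla/fmls patterns per case. */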
static inline void store_m8n1_contracted(float *C, | |||
float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i, | |||
float alphar, float alphai) { | |||
float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8); | |||
ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar); | |||
ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar); | |||
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai); | |||
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai); | |||
ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai); | |||
ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai); | |||
ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar); | |||
ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar); | |||
vst2q_f32(C, ld1); | |||
vst2q_f32(C + 8, ld2); | |||
} | |||
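/* store_m8n1_contracted above applies the alpha scaling for one column of 8
 * complex results: each vld2q_f32 de-interleaves 4 complex elements of C into
 * a real and an imaginary vector, the fma/fms pairs then compute
 *   C_re += acc_re*alphar - acc_im*alphai
 *   C_im += acc_re*alphai + acc_im*alphar
 * and vst2q_f32 re-interleaves the result back into C. */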
static inline void kernel_8x4(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
const float *c_pref = C; | |||
float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i; | |||
float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i; | |||
/** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */ | |||
/** v0-v1 and v10-v11 for B, v2-v9 for A */ | |||
__asm__ __volatile__( | |||
"cmp %[K],#0; mov %[c_pref],%[C]\n\t" | |||
"movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
"movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
"movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
"movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"movi %[c6i].16b,#0\n\t" | |||
"movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t" | |||
"movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" | |||
"beq 4f\n\t" | |||
"cmp %[K],#2\n\t" | |||
"ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t" | |||
"ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t" | |||
"mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t" | |||
"bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t" | |||
"bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t" | |||
"blt 3f; beq 2f\n\t" | |||
"1:\n\t" | |||
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" | |||
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" | |||
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t" | |||
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" | |||
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" | |||
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" | |||
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" | |||
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" | |||
"fmov v5.d[1],x0; fmov d1,x2\n\t" | |||
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t" | |||
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t" | |||
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" | |||
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" | |||
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" | |||
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t" | |||
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" | |||
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" | |||
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" | |||
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" | |||
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" | |||
"fmov v7.d[1],x0; fmov d10,x5\n\t" | |||
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" | |||
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t" | |||
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" | |||
"fmov v10.d[1],x6; fmov d11,x2\n\t" | |||
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t" | |||
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t" | |||
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t" | |||
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" | |||
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" | |||
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t" | |||
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t" | |||
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" | |||
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" | |||
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t" | |||
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" | |||
"fmov v9.d[1],x0; fmov d0,x5\n\t" | |||
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t" | |||
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" | |||
FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" | |||
"fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t" | |||
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t" | |||
FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" | |||
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" | |||
"fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t" | |||
FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" | |||
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t" | |||
FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t" | |||
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t" | |||
FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t" | |||
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t" | |||
FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t" | |||
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t" | |||
FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t" | |||
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t" | |||
FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" | |||
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" | |||
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" | |||
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" | |||
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" | |||
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" | |||
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" | |||
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" | |||
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" | |||
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" | |||
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" | |||
"bgt 1b; blt 3f\n\t" | |||
"2:\n\t" | |||
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" | |||
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" | |||
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t" | |||
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" | |||
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" | |||
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" | |||
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" | |||
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" | |||
"fmov v5.d[1],x0; fmov d1,x2\n\t" | |||
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t" | |||
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t" | |||
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" | |||
"fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" | |||
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" | |||
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t" | |||
FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" | |||
"fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" | |||
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" | |||
FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" | |||
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" | |||
"fmov v7.d[1],x0; fmov d10,x5\n\t" | |||
FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" | |||
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" | |||
FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" | |||
"fmov v10.d[1],x6; fmov d11,x2\n\t" | |||
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" | |||
FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" | |||
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" | |||
"fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" | |||
FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" | |||
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t" | |||
FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" | |||
"fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" | |||
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" | |||
FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t" | |||
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" | |||
"fmov v9.d[1],x0\n\t" | |||
FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" | |||
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" | |||
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" | |||
FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" | |||
FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t" | |||
FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t" | |||
FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t" | |||
FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t" | |||
FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" | |||
FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" | |||
FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" | |||
FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" | |||
FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" | |||
FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" | |||
FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" | |||
FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" | |||
FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" | |||
FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" | |||
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" | |||
"b 4f\n\t" | |||
"3:\n\t" | |||
"fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" | |||
FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" | |||
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t" | |||
FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" | |||
"fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" | |||
FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" | |||
FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t" | |||
FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" | |||
"fmov v5.d[1],x0; fmov d1,x2\n\t" | |||
FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t" | |||
FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t" | |||
FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" | |||
"fmov v1.d[1],x4\n\t" | |||
FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t" | |||
FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t" | |||
FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t" | |||
FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t" | |||
FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t" | |||
FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" | |||
FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t" | |||
FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" | |||
FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t" | |||
FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" | |||
FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" | |||
FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" | |||
"4:\n\t" | |||
"mov %[c_pref],%[C]\n\t" | |||
"zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
"zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
"zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" | |||
"zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t" | |||
"zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t" | |||
"zip2 %[c2i].2d,v4.2d,v5.2d\n\t" | |||
"zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t" | |||
"zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t" | |||
"zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t" | |||
"zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t" | |||
"zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t" | |||
"zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t" | |||
"zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t" | |||
"zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t" | |||
"zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t" | |||
"zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t" | |||
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), | |||
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), | |||
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), | |||
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), | |||
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref) | |||
:[C]"r"(C), [LDC]"r"(LDC) | |||
:"cc","memory","x0","x1","x2","x3","x4","x5","x6", | |||
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"); | |||
store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2; | |||
store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2; | |||
store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2; | |||
store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai); | |||
} | |||
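/* The smaller tiles below use plain NEON intrinsics with an "expanded"
 * accumulation scheme: acc_expanded_m2n2 multiplies a vector of interleaved
 * complex A elements by each scalar lane of B and keeps the raw partial
 * products in separate accumulators; the complex combination (conjugation
 * signs and alpha scaling) is applied only once per tile in
 * store_expanded_m2n2, using the sign-mask vectors prepared by expand_alpha. */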
static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc, | |||
float32x4_t a, float32x4_t b) { | |||
acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0); | |||
acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1); | |||
acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2); | |||
acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3); | |||
return acc; | |||
} | |||
static inline float32x4x4_t expand_alpha(float alphar, float alphai) { | |||
float32x4x4_t ret; | |||
const float maskp[] = { -1, 1, -1, 1 }; | |||
const float maskn[] = { 1, -1, 1, -1 }; | |||
const float32x4_t vrevp = vld1q_f32(maskp); | |||
const float32x4_t vrevn = vld1q_f32(maskn); | |||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
ret.val[0] = vdupq_n_f32(alphar); | |||
ret.val[1] = vdupq_n_f32(-alphai); | |||
ret.val[2] = vmulq_f32(ret.val[1], vrevn); | |||
ret.val[3] = vmulq_f32(ret.val[0], vrevp); | |||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
ret.val[0] = vdupq_n_f32(alphar); | |||
ret.val[1] = vdupq_n_f32(alphai); | |||
ret.val[2] = vmulq_f32(ret.val[1], vrevp); | |||
ret.val[3] = vmulq_f32(ret.val[0], vrevn); | |||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
ret.val[2] = vdupq_n_f32(alphai); | |||
ret.val[3] = vdupq_n_f32(alphar); | |||
ret.val[0] = vmulq_f32(ret.val[3], vrevn); | |||
ret.val[1] = vmulq_f32(ret.val[2], vrevp); | |||
#else | |||
ret.val[2] = vdupq_n_f32(alphai); | |||
ret.val[3] = vdupq_n_f32(-alphar); | |||
ret.val[0] = vmulq_f32(ret.val[3], vrevp); | |||
ret.val[1] = vmulq_f32(ret.val[2], vrevn); | |||
#endif | |||
return ret; | |||
} | |||
static inline void store_expanded_m2n2(float *C, BLASLONG LDC, | |||
float32x4x4_t acc, float32x4x4_t expanded_alpha) { | |||
float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2); | |||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); | |||
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]); | |||
acc.val[0] = vrev64q_f32(acc.val[0]); | |||
acc.val[2] = vrev64q_f32(acc.val[2]); | |||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); | |||
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]); | |||
acc.val[1] = vrev64q_f32(acc.val[1]); | |||
acc.val[3] = vrev64q_f32(acc.val[3]); | |||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); | |||
ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]); | |||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); | |||
ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]); | |||
vst1q_f32(C, ld1); | |||
vst1q_f32(C + LDC * 2, ld2); | |||
} | |||
static inline float32x4x4_t init_expanded_m2n2() { | |||
float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0), | |||
vdupq_n_f32(0), vdupq_n_f32(0) }}; | |||
return ret; | |||
} | |||
static inline void kernel_4x4(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x4x4_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m2n2(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4), | |||
b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b1); | |||
c3 = acc_expanded_m2n2(c3, a1, b2); | |||
c4 = acc_expanded_m2n2(c4, a2, b2); | |||
c1 = acc_expanded_m2n2(c1, a3, b3); | |||
c2 = acc_expanded_m2n2(c2, a4, b3); | |||
c3 = acc_expanded_m2n2(c3, a3, b4); | |||
c4 = acc_expanded_m2n2(c4, a4, b4); | |||
} | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b1); | |||
c3 = acc_expanded_m2n2(c3, a1, b2); | |||
c4 = acc_expanded_m2n2(c4, a2, b2); | |||
} | |||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
store_expanded_m2n2(C + 4, LDC, c2, e_alpha); | |||
C += LDC * 4; | |||
store_expanded_m2n2(C, LDC, c3, e_alpha); | |||
store_expanded_m2n2(C + 4, LDC, c4, e_alpha); | |||
} | |||
static inline void kernel_8x2(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x4x4_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m2n2(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); | |||
float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20); | |||
float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b1); | |||
c3 = acc_expanded_m2n2(c3, a3, b1); | |||
c4 = acc_expanded_m2n2(c4, a4, b1); | |||
c1 = acc_expanded_m2n2(c1, a5, b2); | |||
c2 = acc_expanded_m2n2(c2, a6, b2); | |||
c3 = acc_expanded_m2n2(c3, a7, b2); | |||
c4 = acc_expanded_m2n2(c4, a8, b2); | |||
} | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); | |||
float32x4_t b1 = vld1q_f32(sb); | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b1); | |||
c3 = acc_expanded_m2n2(c3, a3, b1); | |||
c4 = acc_expanded_m2n2(c4, a4, b1); | |||
} | |||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
store_expanded_m2n2(C + 4, LDC, c2, e_alpha); | |||
store_expanded_m2n2(C + 8, LDC, c3, e_alpha); | |||
store_expanded_m2n2(C + 12, LDC, c4, e_alpha); | |||
} | |||
static inline void kernel_4x2(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x4x4_t c1, c2; | |||
c1 = c2 = init_expanded_m2n2(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b1); | |||
c1 = acc_expanded_m2n2(c1, a3, b2); | |||
c2 = acc_expanded_m2n2(c2, a4, b2); | |||
} | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
float32x4_t b1 = vld1q_f32(sb); | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b1); | |||
} | |||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
store_expanded_m2n2(C + 4, LDC, c2, e_alpha); | |||
} | |||
static inline void kernel_2x4(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x4x4_t c1, c2; | |||
c1 = c2 = init_expanded_m2n2(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); | |||
float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a1, b2); | |||
c1 = acc_expanded_m2n2(c1, a2, b3); | |||
c2 = acc_expanded_m2n2(c2, a2, b4); | |||
} | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa); | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a1, b2); | |||
} | |||
float32x4x4_t e_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n2(C, LDC, c1, e_alpha); | |||
store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha); | |||
} | |||
static inline void kernel_2x2(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x4x4_t c1, c2; | |||
c1 = c2 = init_expanded_m2n2(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; | |||
float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
c2 = acc_expanded_m2n2(c2, a2, b2); | |||
} | |||
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); | |||
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); | |||
c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]); | |||
c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]); | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa); | |||
float32x4_t b1 = vld1q_f32(sb); | |||
c1 = acc_expanded_m2n2(c1, a1, b1); | |||
} | |||
store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai)); | |||
} | |||
static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc, | |||
float32x4_t a, float32x2_t b) { | |||
acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0); | |||
acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1); | |||
return acc; | |||
} | |||
static inline void store_expanded_m2n1(float *C, | |||
float32x4x2_t acc, float32x4x4_t expanded_alpha) { | |||
float32x4_t ld1 = vld1q_f32(C); | |||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); | |||
acc.val[0] = vrev64q_f32(acc.val[0]); | |||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); | |||
acc.val[1] = vrev64q_f32(acc.val[1]); | |||
ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); | |||
ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); | |||
vst1q_f32(C, ld1); | |||
} | |||
static inline float32x4x2_t init_expanded_m2n1() { | |||
float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }}; | |||
return ret; | |||
} | |||
static inline void kernel_8x1(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K) { | |||
float32x4x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m2n1(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12), | |||
a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20), | |||
a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; | |||
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; | |||
c1 = acc_expanded_m2n1(c1, a1, b1); | |||
c2 = acc_expanded_m2n1(c2, a2, b1); | |||
c3 = acc_expanded_m2n1(c3, a3, b1); | |||
c4 = acc_expanded_m2n1(c4, a4, b1); | |||
c1 = acc_expanded_m2n1(c1, a5, b2); | |||
c2 = acc_expanded_m2n1(c2, a6, b2); | |||
c3 = acc_expanded_m2n1(c3, a7, b2); | |||
c4 = acc_expanded_m2n1(c4, a8, b2); | |||
} | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); | |||
float32x2_t b1 = vld1_f32(sb); | |||
c1 = acc_expanded_m2n1(c1, a1, b1); | |||
c2 = acc_expanded_m2n1(c2, a2, b1); | |||
c3 = acc_expanded_m2n1(c3, a3, b1); | |||
c4 = acc_expanded_m2n1(c4, a4, b1); | |||
} | |||
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n1(C, c1, expanded_alpha); | |||
store_expanded_m2n1(C + 4, c2, expanded_alpha); | |||
store_expanded_m2n1(C + 8, c3, expanded_alpha); | |||
store_expanded_m2n1(C + 12, c4, expanded_alpha); | |||
} | |||
static inline void kernel_4x1(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K) { | |||
float32x4x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m2n1(); | |||
for (; K > 1; K -= 2) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; | |||
c1 = acc_expanded_m2n1(c1, a1, b1); | |||
c2 = acc_expanded_m2n1(c2, a2, b1); | |||
c3 = acc_expanded_m2n1(c3, a3, b2); | |||
c4 = acc_expanded_m2n1(c4, a4, b2); | |||
} | |||
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); | |||
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); | |||
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); | |||
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); | |||
if (K) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); | |||
float32x2_t b1 = vld1_f32(sb); | |||
c1 = acc_expanded_m2n1(c1, a1, b1); | |||
c2 = acc_expanded_m2n1(c2, a2, b1); | |||
} | |||
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n1(C, c1, expanded_alpha); | |||
store_expanded_m2n1(C + 4, c2, expanded_alpha); | |||
} | |||
static inline void kernel_2x1(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K) { | |||
float32x4x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m2n1(); | |||
for (; K > 3; K -= 4) { | |||
float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), | |||
a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; | |||
float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2), | |||
b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8; | |||
c1 = acc_expanded_m2n1(c1, a1, b1); | |||
c2 = acc_expanded_m2n1(c2, a2, b2); | |||
c3 = acc_expanded_m2n1(c3, a3, b3); | |||
c4 = acc_expanded_m2n1(c4, a4, b4); | |||
} | |||
c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); | |||
c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); | |||
c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); | |||
c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); | |||
c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); | |||
c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); | |||
for (; K; K--) { | |||
float32x4_t a1 = vld1q_f32(sa); sa += 4; | |||
float32x2_t b1 = vld1_f32(sb); sb += 2; | |||
c1 = acc_expanded_m2n1(c1, a1, b1); | |||
} | |||
float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); | |||
store_expanded_m2n1(C, c1, expanded_alpha); | |||
} | |||
static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) { | |||
float32x2x4_t ret; | |||
const float maskp[] = { -1, 1 }; | |||
const float maskn[] = { 1, -1 }; | |||
const float32x2_t vrevp = vld1_f32(maskp); | |||
const float32x2_t vrevn = vld1_f32(maskn); | |||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
ret.val[0] = vdup_n_f32(alphar); | |||
ret.val[1] = vdup_n_f32(-alphai); | |||
ret.val[2] = vmul_f32(ret.val[1], vrevn); | |||
ret.val[3] = vmul_f32(ret.val[0], vrevp); | |||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
ret.val[0] = vdup_n_f32(alphar); | |||
ret.val[1] = vdup_n_f32(alphai); | |||
ret.val[2] = vmul_f32(ret.val[1], vrevp); | |||
ret.val[3] = vmul_f32(ret.val[0], vrevn); | |||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
ret.val[2] = vdup_n_f32(alphai); | |||
ret.val[3] = vdup_n_f32(alphar); | |||
ret.val[0] = vmul_f32(ret.val[3], vrevn); | |||
ret.val[1] = vmul_f32(ret.val[2], vrevp); | |||
#else | |||
ret.val[2] = vdup_n_f32(alphai); | |||
ret.val[3] = vdup_n_f32(-alphar); | |||
ret.val[0] = vmul_f32(ret.val[3], vrevp); | |||
ret.val[1] = vmul_f32(ret.val[2], vrevn); | |||
#endif | |||
return ret; | |||
} | |||
static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc, | |||
float32x2_t a, float32x2_t b) { | |||
acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0); | |||
acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1); | |||
return acc; | |||
} | |||
static inline void store_expanded_m1n1(float *C, | |||
float32x2x2_t acc, float32x2x4_t expanded_alpha) { | |||
float32x2_t ld1 = vld1_f32(C); | |||
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]); | |||
acc.val[0] = vrev64_f32(acc.val[0]); | |||
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]); | |||
acc.val[1] = vrev64_f32(acc.val[1]); | |||
ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]); | |||
ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]); | |||
vst1_f32(C, ld1); | |||
} | |||
static inline float32x2x2_t init_expanded_m1n1() { | |||
float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }}; | |||
return ret; | |||
} | |||
static inline void kernel_1x4(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m1n1(); | |||
for (; K; K--) { | |||
float32x2_t a1 = vld1_f32(sa); sa += 2; | |||
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); | |||
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); | |||
c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4)); | |||
c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6)); | |||
sb += 8; | |||
} | |||
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); | |||
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; | |||
store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2; | |||
store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2; | |||
store_expanded_m1n1(C, c4, expanded_alpha); | |||
} | |||
static inline void kernel_1x2(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K, BLASLONG LDC) { | |||
float32x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m1n1(); | |||
for (; K > 1; K -= 2) { | |||
float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4; | |||
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); | |||
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); | |||
c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4)); | |||
c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6)); | |||
sb += 8; | |||
} | |||
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); | |||
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); | |||
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); | |||
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); | |||
if (K) { | |||
float32x2_t a1 = vld1_f32(sa); | |||
c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); | |||
c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); | |||
} | |||
float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); | |||
store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; | |||
store_expanded_m1n1(C, c2, expanded_alpha); | |||
} | |||
static inline void kernel_1x1(const float *sa, const float *sb, float *C, | |||
float alphar, float alphai, BLASLONG K) { | |||
float32x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init_expanded_m1n1(); | |||
for (; K > 3; K -= 4) { | |||
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); | |||
c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2)); | |||
c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4)); | |||
c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6)); | |||
sa += 8; sb += 8; | |||
} | |||
c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); | |||
c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); | |||
c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); | |||
c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); | |||
c1.val[0] = vadd_f32(c1.val[0], c2.val[0]); | |||
c1.val[1] = vadd_f32(c1.val[1], c2.val[1]); | |||
for (; K; K--) { | |||
c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); | |||
sa += 2; sb += 2; | |||
} | |||
store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai)); | |||
} | |||
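/* Driver: sa and sb are the packed A and B panels. N is processed in blocks
 * of 8 columns (two 4-column sb panels per pass), then 4, 2 and 1; within
 * each block, M is processed in blocks of 8, 4, 2 and 1 rows, dispatching to
 * the kernels above. Pointer strides such as "a_ += 16 * K" for an 8-row
 * panel are in units of FLOAT, i.e. two per complex element. */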
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, | |||
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { | |||
BLASLONG n_left = N; | |||
for (; n_left >= 8; n_left -= 8) { | |||
const FLOAT *a_ = sa; | |||
FLOAT *c1_ = C; | |||
FLOAT *c2_ = C + LDC * 8; | |||
const FLOAT *b1_ = sb; | |||
const FLOAT *b2_ = sb + K * 8; | |||
BLASLONG m_left = M; | |||
for (; m_left >= 8; m_left -= 8) { | |||
kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
a_ += 16 * K; | |||
c1_ += 16; | |||
c2_ += 16; | |||
} | |||
if (m_left >= 4) { | |||
m_left -= 4; | |||
kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
a_ += 8 * K; | |||
c1_ += 8; | |||
c2_ += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
a_ += 4 * K; | |||
c1_ += 4; | |||
c2_ += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC); | |||
kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC); | |||
} | |||
C += 16 * LDC; | |||
sb += 16 * K; | |||
} | |||
if (n_left >= 4) { | |||
n_left -= 4; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
BLASLONG m_left = M; | |||
for (; m_left >= 8; m_left -= 8) { | |||
kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
a_ += 16 * K; | |||
c_ += 16; | |||
} | |||
if (m_left >= 4) { | |||
m_left -= 4; | |||
kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
a_ += 8 * K; | |||
c_ += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
a_ += 4 * K; | |||
c_ += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC); | |||
} | |||
C += 8 * LDC; | |||
sb += 8 * K; | |||
} | |||
if (n_left >= 2) { | |||
n_left -= 2; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
BLASLONG m_left = M; | |||
for (; m_left >= 8; m_left -= 8) { | |||
kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
a_ += 16 * K; | |||
c_ += 16; | |||
} | |||
if (m_left >= 4) { | |||
m_left -= 4; | |||
kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
a_ += 8 * K; | |||
c_ += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
a_ += 4 * K; | |||
c_ += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC); | |||
} | |||
C += 4 * LDC; | |||
sb += 4 * K; | |||
} | |||
if (n_left) { | |||
BLASLONG m_left = M; | |||
for (; m_left >= 8; m_left -= 8) { | |||
kernel_8x1(sa, sb, C, alphar, alphai, K); | |||
sa += 16 * K; | |||
C += 16; | |||
} | |||
if (m_left >= 4) { | |||
m_left -= 4; | |||
kernel_4x1(sa, sb, C, alphar, alphai, K); | |||
sa += 8 * K; | |||
C += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x1(sa, sb, C, alphar, alphai, K); | |||
sa += 4 * K; | |||
C += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x1(sa, sb, C, alphar, alphai, K); | |||
} | |||
} | |||
return 0; | |||
} | |||
@@ -0,0 +1,890 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
/********************************************************** | |||
* Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12 | |||
* Operation: C[4][12] += alpha * sa[4][K] * sb[K][12] | |||
* Matrix orders: | |||
* sa: column-major (leading dimension == 4) | |||
* sb: 3 concatenated row-major 4-column submatrices | |||
* C: column-major (leading dimension == LDC) | |||
*********************************************************/ | |||
static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
/** prefetch the 4x12 block of matrix C for read-modify-write access */ | |||
__asm__ __volatile__( | |||
"mov x0,%[C]\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" | |||
"prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t" | |||
::[C]"r"(C), [LDC]"r"(LDC):"x0"); | |||
/** pointers to the three 4-column row-major submatrices of sb */ | |||
const FLOAT *b1_ = sb; | |||
const FLOAT *b2_ = sb + K * 4; | |||
const FLOAT *b3_ = sb + K * 8; | |||
/** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */ | |||
/** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */ | |||
/** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */ | |||
/** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */ | |||
/** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */ | |||
__asm__ __volatile__( | |||
"cmp %[K],#0\n\t" | |||
/** fill registers holding elements of C with 0.0 */ | |||
"movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t" | |||
"movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" | |||
"movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t" | |||
"movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t" | |||
"movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t" | |||
"movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t" | |||
"beq 4f; cmp %[K],#2\n\t" | |||
/** registers v0-v3 hold A, v4-v7 hold B; x0 stages the 64-bit loads that fill the vector high halves */ | |||
"ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t" | |||
"ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t" | |||
"blt 3f; beq 2f\n\t" | |||
"1:\n\t" | |||
/** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */ | |||
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" | |||
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" | |||
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" | |||
"fmla v10.2d,v0.2d,v4.d[1]\n\t" | |||
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" | |||
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" | |||
"fmla v12.2d,v0.2d,v5.d[0]\n\t" | |||
"fmla v13.2d,v1.2d,v5.d[0]\n\t" | |||
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" | |||
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" | |||
"fmla v15.2d,v1.2d,v5.d[1]\n\t" | |||
"fmla v16.2d,v0.2d,v6.d[0]\n\t" | |||
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" | |||
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" | |||
"fmla v18.2d,v0.2d,v6.d[1]\n\t" | |||
"fmla v19.2d,v1.2d,v6.d[1]\n\t" | |||
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" | |||
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" | |||
"fmla v22.2d,v0.2d,v7.d[1]\n\t" | |||
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" | |||
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" | |||
"fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t" | |||
"fmla v25.2d,v1.2d,v4.d[0]\n\t" | |||
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" | |||
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" | |||
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" | |||
"fmla v28.2d,v0.2d,v5.d[0]\n\t" | |||
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" | |||
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" | |||
"fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t" | |||
"fmla v31.2d,v1.2d,v5.d[1]\n\t" | |||
"ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t" | |||
"fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t" | |||
"fmla v9.2d,v3.2d,v6.d[0]\n\t" | |||
"fmla v10.2d,v2.2d,v6.d[1]\n\t" | |||
"ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t" | |||
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" | |||
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" | |||
"fmla v13.2d,v3.2d,v7.d[0]\n\t" | |||
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" | |||
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" | |||
"fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t" | |||
"fmla v16.2d,v2.2d,v4.d[0]\n\t" | |||
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" | |||
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" | |||
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" | |||
"fmla v19.2d,v3.2d,v4.d[1]\n\t" | |||
"ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t" | |||
"fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t" | |||
"fmla v21.2d,v3.2d,v5.d[0]\n\t" | |||
"fmla v22.2d,v2.2d,v5.d[1]\n\t" | |||
"ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t" | |||
"fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t" | |||
"fmla v24.2d,v2.2d,v6.d[0]\n\t" | |||
"fmla v25.2d,v3.2d,v6.d[0]\n\t" | |||
"ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t" | |||
"fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t" | |||
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" | |||
"fmla v28.2d,v2.2d,v7.d[0]\n\t" | |||
"ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t" | |||
"fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t" | |||
"fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t" | |||
"fmla v31.2d,v3.2d,v7.d[1]\n\t" | |||
"bgt 1b; blt 3f\n\t" | |||
"2:\n\t" | |||
/** tail part with k = 2 */ | |||
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" | |||
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" | |||
"fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" | |||
"fmla v10.2d,v0.2d,v4.d[1]\n\t" | |||
"ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" | |||
"fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" | |||
"fmla v12.2d,v0.2d,v5.d[0]\n\t" | |||
"fmla v13.2d,v1.2d,v5.d[0]\n\t" | |||
"ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" | |||
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" | |||
"fmla v15.2d,v1.2d,v5.d[1]\n\t" | |||
"fmla v16.2d,v0.2d,v6.d[0]\n\t" | |||
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" | |||
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" | |||
"fmla v18.2d,v0.2d,v6.d[1]\n\t" | |||
"fmla v19.2d,v1.2d,v6.d[1]\n\t" | |||
"ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" | |||
"fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
"fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" | |||
"fmla v22.2d,v0.2d,v7.d[1]\n\t" | |||
"ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" | |||
"fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" | |||
"fmla v24.2d,v0.2d,v4.d[0]\n\t" | |||
"fmla v25.2d,v1.2d,v4.d[0]\n\t" | |||
"ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" | |||
"fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" | |||
"fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" | |||
"fmla v28.2d,v0.2d,v5.d[0]\n\t" | |||
"ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" | |||
"fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" | |||
"fmla v30.2d,v0.2d,v5.d[1]\n\t" | |||
"fmla v31.2d,v1.2d,v5.d[1]\n\t" | |||
"fmov v4.d[1],x0\n\t" | |||
"fmla v8.2d,v2.2d,v6.d[0]\n\t" | |||
"fmla v9.2d,v3.2d,v6.d[0]\n\t" | |||
"fmla v10.2d,v2.2d,v6.d[1]\n\t" | |||
"ldr d5,[%[b2_],#48]\n\t" | |||
"fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" | |||
"fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" | |||
"fmla v13.2d,v3.2d,v7.d[0]\n\t" | |||
"ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" | |||
"fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" | |||
"fmla v15.2d,v3.2d,v7.d[1]\n\t" | |||
"fmla v16.2d,v2.2d,v4.d[0]\n\t" | |||
"ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" | |||
"fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" | |||
"fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" | |||
"fmla v19.2d,v3.2d,v4.d[1]\n\t" | |||
"fmov v7.d[1],x0\n\t" | |||
"fmla v20.2d,v2.2d,v5.d[0]\n\t" | |||
"fmla v21.2d,v3.2d,v5.d[0]\n\t" | |||
"fmla v22.2d,v2.2d,v5.d[1]\n\t" | |||
"fmla v23.2d,v3.2d,v5.d[1]\n\t" | |||
"fmla v24.2d,v2.2d,v6.d[0]\n\t" | |||
"fmla v25.2d,v3.2d,v6.d[0]\n\t" | |||
"fmla v26.2d,v2.2d,v6.d[1]\n\t" | |||
"fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" | |||
"fmla v28.2d,v2.2d,v7.d[0]\n\t" | |||
"fmla v29.2d,v3.2d,v7.d[0]\n\t" | |||
"fmla v30.2d,v2.2d,v7.d[1]\n\t" | |||
"fmla v31.2d,v3.2d,v7.d[1]\n\t" | |||
"b 4f\n\t" | |||
"3:\n\t" | |||
/** tail part with k = 1 */ | |||
"ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" | |||
"fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" | |||
"fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t" | |||
"fmla v10.2d,v0.2d,v4.d[1]\n\t" | |||
"fmov v7.d[1],x0\n\t" | |||
"fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t" | |||
"fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t" | |||
"fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t" | |||
"ldr d4,[%[b3_]]\n\t" | |||
"fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" | |||
"fmla v15.2d,v1.2d,v5.d[1]\n\t" | |||
"fmla v16.2d,v0.2d,v6.d[0]\n\t" | |||
"ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" | |||
"fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" | |||
"fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t" | |||
"fmla v19.2d,v1.2d,v6.d[1]\n\t" | |||
"fmov v5.d[1],x0\n\t" | |||
"fmla v20.2d,v0.2d,v7.d[0]\n\t" | |||
"fmla v21.2d,v1.2d,v7.d[0]\n\t" | |||
"fmla v22.2d,v0.2d,v7.d[1]\n\t" | |||
"fmla v23.2d,v1.2d,v7.d[1]\n\t" | |||
"fmla v24.2d,v0.2d,v4.d[0]\n\t" | |||
"fmla v25.2d,v1.2d,v4.d[0]\n\t" | |||
"fmla v26.2d,v0.2d,v4.d[1]\n\t" | |||
"fmla v27.2d,v1.2d,v4.d[1]\n\t" | |||
"fmla v28.2d,v0.2d,v5.d[0]\n\t" | |||
"fmla v29.2d,v1.2d,v5.d[0]\n\t" | |||
"fmla v30.2d,v0.2d,v5.d[1]\n\t" | |||
"fmla v31.2d,v1.2d,v5.d[1]\n\t" | |||
/** store 4x12 elements to C */ | |||
"4:\n\t" | |||
"ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t" | |||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
"fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t" | |||
"fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t" | |||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
"fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t" | |||
"fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t" | |||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
"fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t" | |||
"fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t" | |||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
"fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t" | |||
"fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t" | |||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
"fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t" | |||
"fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t" | |||
"stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" | |||
"stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" | |||
"ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" | |||
"fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t" | |||
"fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t" | |||
"stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t" | |||
:[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K) | |||
:[LDC]"r"(LDC), [alpha]"m"(alpha) | |||
:"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | |||
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", | |||
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||
} | |||
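/* Note on the load pattern above: A and B elements are brought in with 64-bit
 * "ldr d"/"ldr x" loads followed by an "fmov v.d[1],x0" insert rather than
 * single 128-bit "ldr q" loads. On the in-order, dual-issue Cortex-A53 this
 * is the usual way to keep the fmla stream fed, since the narrow loads and
 * the GPR-to-SIMD insert can issue alongside the NEON arithmetic, whereas
 * back-to-back q-register loads tend to stall it. The precise interleaving is
 * hand-tuned; this comment only describes the general idea. */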
/********************************************************** | |||
* Operation: | |||
C[0] += alpha * up[0]; C[1] += alpha * up[1]; | |||
C[2] += alpha * down[0]; C[3] += alpha * down[1]; | |||
*********************************************************/ | |||
static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) { | |||
float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2); | |||
t1 = vfmaq_n_f64(t1, up, alpha); | |||
t2 = vfmaq_n_f64(t2, down, alpha); | |||
vst1q_f64(C, t1); | |||
vst1q_f64(C + 2, t2); | |||
} | |||
/********************************************************** | |||
* Function: dgemm_kernel_arm64_4x4_m4n8 | |||
* Operation: C[4][8] += alpha * sa[4][K] * sb[K][8] | |||
* Matrix orders: | |||
* sa: column-major (leading dimension == 4) | |||
* sb: 2 concatenated row-major 4-column submatrices | |||
* C: column-major (leading dimension == LDC) | |||
*********************************************************/ | |||
static inline void dgemm_kernel_arm64_4x4_m4n8( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
const FLOAT *b1_ = sb; | |||
const FLOAT *b2_ = sb + K * 4; | |||
/** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */ | |||
float64x2_t c11, c12, c13, c14, c15, c16, c17, c18; | |||
float64x2_t c21, c22, c23, c24, c25, c26, c27, c28; | |||
c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0); | |||
c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0); | |||
for (; K; K--) { | |||
float64x2_t a1 = vld1q_f64(sa); | |||
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1 = vld1q_f64(b1_); | |||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
c21 = vfmaq_laneq_f64(c21, a2, b1, 0); | |||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
c22 = vfmaq_laneq_f64(c22, a2, b1, 1); | |||
float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4; | |||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
c23 = vfmaq_laneq_f64(c23, a2, b2, 0); | |||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
c24 = vfmaq_laneq_f64(c24, a2, b2, 1); | |||
float64x2_t b3 = vld1q_f64(b2_); | |||
c15 = vfmaq_laneq_f64(c15, a1, b3, 0); | |||
c25 = vfmaq_laneq_f64(c25, a2, b3, 0); | |||
c16 = vfmaq_laneq_f64(c16, a1, b3, 1); | |||
c26 = vfmaq_laneq_f64(c26, a2, b3, 1); | |||
float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4; | |||
c17 = vfmaq_laneq_f64(c17, a1, b4, 0); | |||
c27 = vfmaq_laneq_f64(c27, a2, b4, 0); | |||
c18 = vfmaq_laneq_f64(c18, a1, b4, 1); | |||
c28 = vfmaq_laneq_f64(c28, a2, b4, 1); | |||
} | |||
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c14, c24, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c15, c25, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c16, c26, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c17, c27, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c18, c28, alpha); | |||
} | |||
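/* Illustration only (not part of the kernel; the helper name is hypothetical):
   a plain-C restatement of the element access dgemm_kernel_arm64_4x4_m4n8
   above assumes. sa is packed column-major with leading dimension 4; sb holds
   two row-major K x 4 panels back to back, so columns 0-3 come from sb and
   columns 4-7 from sb + 4 * K. */
static inline void dgemm_ref_m4n8_sketch(const FLOAT *sa, const FLOAT *sb,
                                         FLOAT *C, BLASLONG K, BLASLONG LDC,
                                         FLOAT alpha) {
  for (BLASLONG n = 0; n < 8; n++) {
    const FLOAT *b = (n < 4) ? sb : sb + 4 * K;  /* pick the packed panel      */
    BLASLONG col = (n < 4) ? n : n - 4;
    for (BLASLONG m = 0; m < 4; m++) {
      FLOAT acc = 0;
      for (BLASLONG k = 0; k < K; k++)
        acc += sa[m + 4 * k] * b[col + 4 * k];   /* A col-major, B row-major   */
      C[m + n * LDC] += alpha * acc;             /* column-major C, ld == LDC  */
    }
  }
}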
/********************************************************** | |||
* Function: dgemm_kernel_arm64_4x4_m4n4 | |||
* Operation: C[4][4] += alpha * sa[4][K] * sb[K][4] | |||
* Matrix orders: | |||
* sa: column-major (leading dimension == 4) | |||
* sb: row-major (leading dimension == 4) | |||
* C: column-major (leading dimension == LDC) | |||
*********************************************************/ | |||
static inline void dgemm_kernel_arm64_4x4_m4n4( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c11, c21, c12, c22, c13, c23, c14, c24; | |||
c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0); | |||
for (; K; K--) { | |||
float64x2_t a1 = vld1q_f64(sa); | |||
float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1 = vld1q_f64(sb); | |||
float64x2_t b2 = vld1q_f64(sb + 2); sb += 4; | |||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
c21 = vfmaq_laneq_f64(c21, a2, b1, 0); | |||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
c22 = vfmaq_laneq_f64(c22, a2, b1, 1); | |||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
c23 = vfmaq_laneq_f64(c23, a2, b2, 0); | |||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
c24 = vfmaq_laneq_f64(c24, a2, b2, 1); | |||
} | |||
dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c14, c24, alpha); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m4n2( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2; | |||
c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2), | |||
a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8; | |||
c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0); | |||
c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0); | |||
c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1); | |||
c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1); | |||
c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0); | |||
c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0); | |||
c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1); | |||
c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1); | |||
} | |||
c11_1 = vaddq_f64(c11_1, c11_2); | |||
c21_1 = vaddq_f64(c21_1, c21_2); | |||
c12_1 = vaddq_f64(c12_1, c12_2); | |||
c22_1 = vaddq_f64(c22_1, c22_2); | |||
if (K) { | |||
float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0); | |||
c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0); | |||
c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1); | |||
c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1); | |||
} | |||
dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC; | |||
dgemm_store_m4n1(C, c12_1, c22_1, alpha); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m4n1( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c11_1, c11_2, c21_1, c21_2; | |||
c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0); | |||
c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0); | |||
c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1); | |||
c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1); | |||
sa += 8; | |||
} | |||
c11_1 = vaddq_f64(c11_1, c11_2); | |||
c21_1 = vaddq_f64(c21_1, c21_2); | |||
if (K) { | |||
double b1 = *sb++; | |||
c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1); | |||
c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1); | |||
sa += 4; | |||
} | |||
dgemm_store_m4n1(C, c11_1, c21_1, alpha); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m2n12( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24; | |||
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = | |||
c21 = c22 = c23 = c24 = vdupq_n_f64(0); | |||
const FLOAT *b1_ = sb; | |||
const FLOAT *b2_ = sb + 4 * K; | |||
const FLOAT *b3_ = b2_ + 4 * K; | |||
for (; K; K--) { | |||
const float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; | |||
c01 = vfmaq_laneq_f64(c01, a1, b1, 0); | |||
c02 = vfmaq_laneq_f64(c02, a1, b1, 1); | |||
c03 = vfmaq_laneq_f64(c03, a1, b2, 0); | |||
c04 = vfmaq_laneq_f64(c04, a1, b2, 1); | |||
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; | |||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4; | |||
c21 = vfmaq_laneq_f64(c21, a1, b1, 0); | |||
c22 = vfmaq_laneq_f64(c22, a1, b1, 1); | |||
c23 = vfmaq_laneq_f64(c23, a1, b2, 0); | |||
c24 = vfmaq_laneq_f64(c24, a1, b2, 1); | |||
} | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha)); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m2n8( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c01, c02, c03, c04, c11, c12, c13, c14; | |||
c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0); | |||
const FLOAT *b1_ = sb; | |||
const FLOAT *b2_ = sb + 4 * K; | |||
for (; K; K--) { | |||
const float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; | |||
c01 = vfmaq_laneq_f64(c01, a1, b1, 0); | |||
c02 = vfmaq_laneq_f64(c02, a1, b1, 1); | |||
c03 = vfmaq_laneq_f64(c03, a1, b2, 0); | |||
c04 = vfmaq_laneq_f64(c04, a1, b2, 1); | |||
b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; | |||
c11 = vfmaq_laneq_f64(c11, a1, b1, 0); | |||
c12 = vfmaq_laneq_f64(c12, a1, b1, 1); | |||
c13 = vfmaq_laneq_f64(c13, a1, b2, 0); | |||
c14 = vfmaq_laneq_f64(c14, a1, b2, 1); | |||
} | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m2n4( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2; | |||
c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2); | |||
float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8; | |||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0); | |||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1); | |||
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0); | |||
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1); | |||
c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0); | |||
c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1); | |||
c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0); | |||
c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1); | |||
} | |||
c1_1 = vaddq_f64(c1_1, c1_2); | |||
c2_1 = vaddq_f64(c2_1, c2_2); | |||
c3_1 = vaddq_f64(c3_1, c3_2); | |||
c4_1 = vaddq_f64(c4_1, c4_2); | |||
if (K) { | |||
float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); | |||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); | |||
c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0); | |||
c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1); | |||
} | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha)); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m2n2( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1_1, c1_2, c2_1, c2_2; | |||
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); | |||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); | |||
c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0); | |||
c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1); | |||
} | |||
c1_1 = vaddq_f64(c1_1, c1_2); | |||
c2_1 = vaddq_f64(c2_1, c2_2); | |||
if (K) { | |||
float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); | |||
c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); | |||
} | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m2n1( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *c, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
for (; K > 3; K -= 4) { | |||
float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4; | |||
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0); | |||
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1); | |||
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0); | |||
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1); | |||
sa += 8; | |||
} | |||
c1 = vaddq_f64(c1, c2); | |||
c3 = vaddq_f64(c3, c4); | |||
c1 = vaddq_f64(c1, c3); | |||
for (; K; K--) { | |||
c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++); | |||
sa += 2; | |||
} | |||
vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha)); | |||
} | |||
static inline void dgemm_store_m1n2(double *C, float64x2_t vc, | |||
double alpha, BLASLONG LDC) { | |||
double c0 = vgetq_lane_f64(vc, 0); | |||
double c1 = vgetq_lane_f64(vc, 1); | |||
C[0] += c0 * alpha; | |||
C[LDC] += c1 * alpha; | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m1n12( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1, c2, c3, c4, c5, c6; | |||
c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0); | |||
const double *b1_ = sb; | |||
const double *b2_ = sb + 4 * K; | |||
const double *b3_ = b2_ + 4 * K; | |||
for (; K; K--) { | |||
const double a1 = *sa++; | |||
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); | |||
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; | |||
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); | |||
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; | |||
c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1); | |||
c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4; | |||
} | |||
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c6, alpha, LDC); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m1n8( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
const double *b1_ = sb; | |||
const double *b2_ = sb + 4 * K; | |||
for (; K; K--) { | |||
const double a1 = *sa++; | |||
c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); | |||
c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; | |||
c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); | |||
c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; | |||
} | |||
dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c4, alpha, LDC); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m1n4( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1_1, c1_2, c2_1, c2_2; | |||
c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0); | |||
c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0); | |||
c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1); | |||
c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8; | |||
} | |||
c1_1 = vaddq_f64(c1_1, c1_2); | |||
c2_1 = vaddq_f64(c2_1, c2_2); | |||
if (K) { | |||
double a1 = *sa++; | |||
c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1); | |||
c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1); | |||
sb += 4; | |||
} | |||
dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2; | |||
dgemm_store_m1n2(C, c2_1, alpha, LDC); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m1n2( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
for (; K > 3; K -= 4) { | |||
float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4; | |||
c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0); | |||
c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1); | |||
c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0); | |||
c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8; | |||
} | |||
c1 = vaddq_f64(c1, c2); | |||
c3 = vaddq_f64(c3, c4); | |||
c1 = vaddq_f64(c1, c3); | |||
for (; K; K--) { | |||
c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++); | |||
sb += 2; | |||
} | |||
dgemm_store_m1n2(C, c1, alpha, LDC); | |||
} | |||
static inline void dgemm_kernel_arm64_4x4_m1n1( | |||
const FLOAT *sa, const FLOAT *sb, FLOAT *C, | |||
BLASLONG K, BLASLONG LDC, FLOAT alpha) { | |||
float64x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = vdupq_n_f64(0); | |||
for (; K > 7; K -= 8) { | |||
c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa)); | |||
c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2)); | |||
c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4)); | |||
c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6)); | |||
sa += 8; sb += 8; | |||
} | |||
c1 = vaddq_f64(c1, c2); | |||
c3 = vaddq_f64(c3, c4); | |||
c1 = vaddq_f64(c1, c3); | |||
double cs1 = vpaddd_f64(c1); | |||
for (; K; K--) { | |||
cs1 += (*sa++) * (*sb++); | |||
} | |||
C[0] += cs1 * alpha; | |||
} | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, | |||
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { | |||
for (; N >= 12; N -= 12) { | |||
BLASLONG m_left = M; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
for (; m_left >= 4; m_left -= 4) { | |||
dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha); | |||
c_ += 4; | |||
a_ += 4 * K; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha); | |||
c_ += 2; | |||
a_ += 2 * K; | |||
} | |||
if (m_left) { | |||
dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha); | |||
} | |||
sb += 12 * K; | |||
C += 12 * LDC; | |||
} | |||
if (N >= 8) { | |||
N -= 8; | |||
BLASLONG m_left = M; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
for (; m_left >= 4; m_left -= 4) { | |||
dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha); | |||
c_ += 4; | |||
a_ += 4 * K; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha); | |||
c_ += 2; | |||
a_ += 2 * K; | |||
} | |||
if (m_left) { | |||
dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha); | |||
} | |||
sb += 8 * K; | |||
C += 8 * LDC; | |||
} else if (N >= 4) { | |||
N -= 4; | |||
BLASLONG m_left = M; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
for (; m_left >= 4; m_left -= 4) { | |||
dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha); | |||
c_ += 4; | |||
a_ += 4 * K; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha); | |||
c_ += 2; | |||
a_ += 2 * K; | |||
} | |||
if (m_left) { | |||
dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha); | |||
} | |||
sb += 4 * K; | |||
C += 4 * LDC; | |||
} | |||
if (N >= 2) { | |||
N -= 2; | |||
BLASLONG m_left = M; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
for (; m_left >= 4; m_left -= 4) { | |||
dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha); | |||
c_ += 4; | |||
a_ += 4 * K; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha); | |||
c_ += 2; | |||
a_ += 2 * K; | |||
} | |||
if (m_left) { | |||
dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha); | |||
} | |||
sb += 2 * K; | |||
C += 2 * LDC; | |||
} | |||
if (N) { | |||
BLASLONG m_left = M; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
for (; m_left >= 4; m_left -= 4) { | |||
dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha); | |||
c_ += 4; | |||
a_ += 4 * K; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha); | |||
c_ += 2; | |||
a_ += 2 * K; | |||
} | |||
if (m_left) { | |||
dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha); | |||
} | |||
} | |||
return 0; | |||
} | |||
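/* Illustration only (hypothetical helper, never called): the column blocking
   performed by CNAME above, written out in plain C. N is consumed in panels
   of 12, then at most one panel of 8 or one of 4 (never both), then at most
   one panel of 2 and one of 1. */
static void dgemm_n_blocking_sketch(BLASLONG N, BLASLONG counts[5]) {
  counts[0] = N / 12;                           /* full 12-column panels      */
  BLASLONG r = N % 12;
  counts[1] = (r >= 8) ? 1 : 0;                 /* one 8-panel, or ...        */
  counts[2] = (!counts[1] && r >= 4) ? 1 : 0;   /* ... one 4-panel            */
  r -= 8 * counts[1] + 4 * counts[2];
  counts[3] = r / 2;                            /* at most one 2-panel        */
  counts[4] = r % 2;                            /* at most one 1-panel        */
}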
@@ -0,0 +1,874 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* X0 X1 X2 d0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ | |||
#define origM x0 | |||
#define origN x1 | |||
#define origK x2 | |||
#define origPA x3 | |||
#define origPB x4 | |||
#define pC x5 | |||
#define LDC x6 | |||
#define temp x7 | |||
#define counterL x8 | |||
#define counterI x9 | |||
#define counterJ x10 | |||
#define pB x11 | |||
#define pCRow0 x12 | |||
#define pCRow1 x13 | |||
#define pCRow2 x14 | |||
#define lanes x15 | |||
#define pA x16 | |||
#define alpha x17 | |||
#define alpha0 d10 | |||
#define alphaZ z2.d | |||
#define A_PRE_SIZE 1536 | |||
#define B_PRE_SIZE 512 | |||
#define C_PRE_SIZE 128 | |||
// 00 origM | |||
// 01 origN | |||
// 02 origK | |||
// 03 origPA | |||
// 04 origPB | |||
// 05 pC | |||
// 06 origLDC -> LDC | |||
// 07 temp | |||
// 08 counterL | |||
// 09 counterI | |||
// 10 counterJ | |||
// 11 pB | |||
// 12 pCRow0 | |||
// 13 pCRow1 | |||
// 14 pCRow2 | |||
// 15 lanes | |||
// 16 pA | |||
// 17 alpha
// 18 must save | |||
// 19 must save | |||
// 20 must save | |||
// 21 must save | |||
// 22 must save | |||
// 23 must save | |||
// 24 must save | |||
// 25 must save | |||
// 26 must save | |||
// 27 must save | |||
// 28 must save | |||
// 29 frame | |||
// 30 link | |||
// 31 sp | |||
//v00 ALPHA -> pA0_0 | |||
//v01 pA0_1 | |||
//v02 ALPHA0 | |||
//v03 | |||
//v04 | |||
//v05 | |||
//v06 | |||
//v07 | |||
//v08 must save pB0_0 | |||
//v09 must save pB0_1 | |||
//v10 must save pB0_2 | |||
//v11 must save pB0_3 | |||
//v12 must save pB0_4 | |||
//v13 must save pB0_5 | |||
//v14 must save pB0_6 | |||
//v15 must save pB0_7 | |||
//v16 must save C0 | |||
//v17 must save C1 | |||
//v18 must save C2 | |||
//v19 must save C3 | |||
//v20 must save C4 | |||
//v21 must save C5 | |||
//v22 must save C6 | |||
//v23 must save C7 | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
.macro INITv1x8 | |||
dup z16.d, #0 | |||
dup z17.d, #0 | |||
dup z18.d, #0 | |||
dup z19.d, #0 | |||
dup z20.d, #0 | |||
dup z21.d, #0 | |||
dup z22.d, #0 | |||
dup z23.d, #0 | |||
.endm | |||
.macro KERNELv1x8_I | |||
ld1d z0.d, p1/z, [pA] | |||
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one | |||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
ld1rd z8.d, p0/z, [pB] | |||
fmla z17.d, p1/m, z0.d, z9.d | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
fmla z18.d, p1/m, z0.d, z10.d | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
fmla z19.d, p1/m, z0.d, z11.d | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
fmla z20.d, p1/m, z0.d, z12.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
fmla z21.d, p1/m, z0.d, z13.d | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
fmla z22.d, p1/m, z0.d, z14.d | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
fmla z23.d, p1/m, z0.d, z15.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
.endm | |||
.macro KERNELv1x8_M1 | |||
ld1d z1.d, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
ld1rd z8.d, p0/z, [pB] | |||
fmla z17.d, p1/m, z0.d, z9.d | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
fmla z18.d, p1/m, z0.d, z10.d | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
fmla z19.d, p1/m, z0.d, z11.d | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
fmla z20.d, p1/m, z0.d, z12.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
fmla z21.d, p1/m, z0.d, z13.d | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
fmla z22.d, p1/m, z0.d, z14.d | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
fmla z23.d, p1/m, z0.d, z15.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
.endm | |||
.macro KERNELv1x8_M2 | |||
ld1d z0.d, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
fmla z16.d, p1/m, z1.d, z8.d | |||
ld1rd z8.d, p0/z, [pB] | |||
fmla z17.d, p1/m, z1.d, z9.d | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
fmla z18.d, p1/m, z1.d, z10.d | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
fmla z19.d, p1/m, z1.d, z11.d | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
fmla z20.d, p1/m, z1.d, z12.d | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
fmla z21.d, p1/m, z1.d, z13.d | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
fmla z22.d, p1/m, z1.d, z14.d | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
fmla z23.d, p1/m, z1.d, z15.d | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
.endm | |||
.macro KERNELv1x8_E | |||
fmla z16.d, p1/m, z1.d, z8.d | |||
fmla z17.d, p1/m, z1.d, z9.d | |||
fmla z18.d, p1/m, z1.d, z10.d | |||
fmla z19.d, p1/m, z1.d, z11.d | |||
fmla z20.d, p1/m, z1.d, z12.d | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
fmla z21.d, p1/m, z1.d, z13.d | |||
fmla z22.d, p1/m, z1.d, z14.d | |||
fmla z23.d, p1/m, z1.d, z15.d | |||
.endm | |||
.macro KERNELv1x8_SUB | |||
ld1d z0.d, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
ld1rd z12.d, p0/z, [pB, 32] | |||
ld1rd z13.d, p0/z, [pB, 40] | |||
ld1rd z14.d, p0/z, [pB, 48] | |||
ld1rd z15.d, p0/z, [pB, 56] | |||
add pB, pB, 64 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
fmla z17.d, p1/m, z0.d, z9.d | |||
fmla z18.d, p1/m, z0.d, z10.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
fmla z19.d, p1/m, z0.d, z11.d | |||
fmla z20.d, p1/m, z0.d, z12.d | |||
fmla z21.d, p1/m, z0.d, z13.d | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
fmla z22.d, p1/m, z0.d, z14.d | |||
fmla z23.d, p1/m, z0.d, z15.d | |||
.endm | |||
.macro SAVEv1x8 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
add pCRow1, pCRow0, LDC | |||
ld1d z24.d, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaZ | |||
st1d z24.d, p1, [pCRow0] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1d z25.d, p1/z, [pCRow1] | |||
fmla z25.d, p1/m, z17.d, alphaZ | |||
st1d z25.d, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1d z26.d, p1/z, [pCRow2] | |||
fmla z26.d, p1/m, z18.d, alphaZ | |||
st1d z26.d, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1d z27.d, p1/z, [pCRow1] | |||
fmla z27.d, p1/m, z19.d, alphaZ | |||
st1d z27.d, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1d z28.d, p1/z, [pCRow2] | |||
fmla z28.d, p1/m, z20.d, alphaZ | |||
st1d z28.d, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1d z29.d, p1/z, [pCRow1] | |||
fmla z29.d, p1/m, z21.d, alphaZ | |||
st1d z29.d, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1d z30.d, p1/z, [pCRow2] | |||
fmla z30.d, p1/m, z22.d, alphaZ | |||
st1d z30.d, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld1d z31.d, p1/z, [pCRow1] | |||
fmla z31.d, p1/m, z23.d, alphaZ | |||
st1d z31.d, p1, [pCRow1] | |||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x4 | |||
dup z16.d, #0 | |||
dup z17.d, #0 | |||
dup z18.d, #0 | |||
dup z19.d, #0 | |||
.endm | |||
.macro KERNELv1x4_SUB | |||
ld1d z0.d, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
ld1rd z10.d, p0/z, [pB, 16] | |||
ld1rd z11.d, p0/z, [pB, 24] | |||
add pB, pB, 32 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
fmla z17.d, p1/m, z0.d, z9.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
fmla z18.d, p1/m, z0.d, z10.d | |||
fmla z19.d, p1/m, z0.d, z11.d | |||
.endm | |||
.macro SAVEv1x4 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
add pCRow1, pCRow0, LDC | |||
ld1d z24.d, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaZ | |||
st1d z24.d, p1, [pCRow0] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1d z25.d, p1/z, [pCRow1] | |||
fmla z25.d, p1/m, z17.d, alphaZ | |||
st1d z25.d, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1d z26.d, p1/z, [pCRow2] | |||
fmla z26.d, p1/m, z18.d, alphaZ | |||
st1d z26.d, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld1d z27.d, p1/z, [pCRow1] | |||
fmla z27.d, p1/m, z19.d, alphaZ | |||
st1d z27.d, p1, [pCRow1] | |||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x2 | |||
dup z16.d, #0 | |||
dup z17.d, #0 | |||
.endm | |||
.macro KERNELv1x2_SUB | |||
ld1d z0.d, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
ld1rd z9.d, p0/z, [pB, 8] | |||
add pB, pB, 16 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
fmla z17.d, p1/m, z0.d, z9.d | |||
.endm | |||
.macro SAVEv1x2 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
add pCRow1, pCRow0, LDC | |||
ld1d z24.d, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaZ | |||
st1d z24.d, p1, [pCRow0] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld1d z25.d, p1/z, [pCRow1] | |||
fmla z25.d, p1/m, z17.d, alphaZ | |||
st1d z25.d, p1, [pCRow1] | |||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x1 | |||
dup z16.d, #0 | |||
.endm | |||
.macro KERNELv1x1_SUB | |||
ld1d z0.d, p1/z, [pA] | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 | |||
ld1rd z8.d, p0/z, [pB] | |||
add pB, pB, 8 | |||
fmla z16.d, p1/m, z0.d, z8.d | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
.endm | |||
.macro SAVEv1x1 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld1d z24.d, p1/z, [pCRow0] | |||
fmla z24.d, p1/m, z16.d, alphaZ | |||
st1d z24.d, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 | |||
.endm | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
PROLOGUE | |||
.align 5 | |||
add sp, sp, #-(11 * 16) | |||
stp d8, d9, [sp, #(0 * 16)] | |||
stp d10, d11, [sp, #(1 * 16)] | |||
stp d12, d13, [sp, #(2 * 16)] | |||
stp d14, d15, [sp, #(3 * 16)] | |||
stp d16, d17, [sp, #(4 * 16)] | |||
stp x18, x19, [sp, #(5 * 16)] | |||
stp x20, x21, [sp, #(6 * 16)] | |||
stp x22, x23, [sp, #(7 * 16)] | |||
stp x24, x25, [sp, #(8 * 16)] | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
prfm PLDL1KEEP, [origPB] | |||
prfm PLDL1KEEP, [origPA] | |||
fmov alpha, d0 | |||
dup alphaZ, alpha | |||
lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
ptrue p0.d // create true predicate | |||
mov pB, origPB | |||
// Loop over N | |||
mov counterJ, origN | |||
asr counterJ, counterJ, #3 // J = J / 8 | |||
cmp counterJ, #0 | |||
ble .Ldgemm_kernel_L4_BEGIN | |||
/******************************************************************************/ | |||
/* Repeat this as long as there are 8 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L8_BEGIN: | |||
mov pCRow0, pC | |||
add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L8_Mv1_BEGIN: | |||
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM | |||
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_20: | |||
mov pB, origPB | |||
INITv1x8 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #2 // are there at least 2 blocks of 8 (16 iterations) to do?
blt .Ldgemm_kernel_L8_Mv1_32 | |||
KERNELv1x8_I | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
subs counterL, counterL, #2 // subtract 2 | |||
ble .Ldgemm_kernel_L8_Mv1_22a | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_22: | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L8_Mv1_22 | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_22a: | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_E | |||
b .Ldgemm_kernel_L8_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_32: | |||
tst counterL, #1 | |||
ble .Ldgemm_kernel_L8_Mv1_40 | |||
KERNELv1x8_I | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_E | |||
b .Ldgemm_kernel_L8_Mv1_44 | |||
.Ldgemm_kernel_L8_Mv1_40: | |||
INITv1x8 | |||
.Ldgemm_kernel_L8_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L8_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_46: | |||
KERNELv1x8_SUB | |||
subs counterL, counterL, #1 | |||
bne .Ldgemm_kernel_L8_Mv1_46 | |||
.Ldgemm_kernel_L8_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x8 | |||
.Ldgemm_kernel_L8_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
b.any .Ldgemm_kernel_L8_Mv1_20 | |||
.Ldgemm_kernel_L8_END: | |||
lsl temp, origK, #6 | |||
add origPB, origPB, temp // B = B + K * 8 * 8 | |||
subs counterJ, counterJ , #1 // j-- | |||
bgt .Ldgemm_kernel_L8_BEGIN | |||
/******************************************************************************/ | |||
/* Repeat the same thing if 4 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L4_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #4 | |||
ble .Ldgemm_kernel_L2_BEGIN | |||
mov pCRow0, pC | |||
add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L4_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
.align 5 | |||
.Ldgemm_kernel_L4_Mv1_20: | |||
mov pB, origPB | |||
INITv1x4 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L4_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L4_Mv1_22: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L4_Mv1_22 | |||
.Ldgemm_kernel_L4_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L4_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L4_Mv1_46: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
subs counterL, counterL, #1 | |||
bne .Ldgemm_kernel_L4_Mv1_46 | |||
.Ldgemm_kernel_L4_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x4 | |||
.Ldgemm_kernel_L4_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
b.any .Ldgemm_kernel_L4_Mv1_20 | |||
.Ldgemm_kernel_L4_END: | |||
lsl temp, origK, #5 | |||
add origPB, origPB, temp // B = B + K * 4 * 8 | |||
/******************************************************************************/ | |||
/* Repeat the same thing if 2 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L2_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #2 | |||
ble .Ldgemm_kernel_L1_BEGIN | |||
mov pCRow0, pC | |||
add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L2_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
.align 5 | |||
.Ldgemm_kernel_L2_Mv1_20: | |||
mov pB, origPB | |||
INITv1x2 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #0 // is there at least 8 to do?
ble .Ldgemm_kernel_L2_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L2_Mv1_22: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L2_Mv1_22 | |||
.Ldgemm_kernel_L2_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L2_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L2_Mv1_46: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bne .Ldgemm_kernel_L2_Mv1_46 | |||
.Ldgemm_kernel_L2_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x2 | |||
.Ldgemm_kernel_L2_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
b.any .Ldgemm_kernel_L2_Mv1_20 | |||
.Ldgemm_kernel_L2_END: | |||
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
/******************************************************************************/ | |||
/* Repeat the same thing if 1 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L1_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #1 | |||
ble .Ldgemm_kernel_L999 // done | |||
mov pCRow0, pC | |||
add pC, pC, LDC // add 1 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L1_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
.align 5 | |||
.Ldgemm_kernel_L1_Mv1_20: | |||
mov pB, origPB | |||
INITv1x1 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #0 // is there at least 8 to do? | |||
ble .Ldgemm_kernel_L1_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L1_Mv1_22: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L1_Mv1_22 | |||
.Ldgemm_kernel_L1_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L1_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L1_Mv1_46: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L1_Mv1_46 | |||
.Ldgemm_kernel_L1_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x1 | |||
.Ldgemm_kernel_L1_Mv1_END: | |||
incd counterI | |||
whilelt p1.d, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.d | |||
b.any .Ldgemm_kernel_L1_Mv1_20 | |||
.Ldgemm_kernel_L1_END: | |||
/******************************************************************************/ | |||
.Ldgemm_kernel_L999: | |||
mov x0, #0 // set return value | |||
ldp d8, d9, [sp, #(0 * 16)] | |||
ldp d10, d11, [sp, #(1 * 16)] | |||
ldp d12, d13, [sp, #(2 * 16)] | |||
ldp d14, d15, [sp, #(3 * 16)] | |||
ldp d16, d17, [sp, #(4 * 16)] | |||
ldp x18, x19, [sp, #(5 * 16)] | |||
ldp x20, x21, [sp, #(6 * 16)] | |||
ldp x22, x23, [sp, #(7 * 16)] | |||
ldp x24, x25, [sp, #(8 * 16)] | |||
ldp x26, x27, [sp, #(9 * 16)] | |||
ldr x28, [sp, #(10 * 16)] | |||
add sp, sp, #(11*16) | |||
ret | |||
EPILOGUE | |||
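/* Illustration only (C with SVE intrinsics, excluded from assembly; the
   function and argument names are hypothetical): one step of the
   KERNELv1x8_SUB macro above, i.e. a predicated load of one column slice of
   A followed by FMLAs against the broadcast B scalars. */
#if 0
#include <arm_sve.h>
#include <stdint.h>
static void kernel_v1x8_step_sketch(const double *pA, const double *pB,
                                    uint64_t counterI, uint64_t origM) {
  svbool_t p1 = svwhilelt_b64(counterI, origM); /* whilelt p1.d, counterI, origM */
  svfloat64_t c16 = svdup_n_f64(0.0);           /* INITv1x8: dup z16.d, #0       */
  svfloat64_t c17 = svdup_n_f64(0.0);
  svfloat64_t a = svld1_f64(p1, pA);            /* ld1d z0.d, p1/z, [pA]         */
  c16 = svmla_n_f64_m(p1, c16, a, pB[0]);       /* ld1rd z8.d + fmla z16.d       */
  c17 = svmla_n_f64_m(p1, c17, a, pB[1]);       /* ld1rd z9.d + fmla z17.d       */
  (void)c16; (void)c17;                         /* ...and so on up to z23        */
}
#endif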
@@ -0,0 +1,79 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
svint64_t lda_vec = svindex_s64(0LL, lda); | |||
uint64_t sve_size = svcntd(); | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint64_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); | |||
svst1_f64(pg, (double *) boffset, a_vec); | |||
aoffset1++; | |||
boffset += active; | |||
} | |||
aoffset += sve_size * lda; | |||
j += svcntd(); | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
return 0; | |||
} |
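/* Scalar equivalent of the gather-based packing above (hypothetical helper,
   for illustration only): each panel of up to svcntd() columns of the
   lda-strided source is written out as m rows of `active` contiguous
   elements. */
static void dgemm_copy_gather_ref(BLASLONG m, BLASLONG n, BLASLONG lda,
                                  const double *a, double *b, BLASLONG veclen) {
  for (BLASLONG j0 = 0; j0 < n; j0 += veclen) {
    BLASLONG active = (n - j0 < veclen) ? (n - j0) : veclen;
    for (BLASLONG i = 0; i < m; i++)            /* one gather per source row  */
      for (BLASLONG j = 0; j < active; j++)
        *b++ = a[i + (j0 + j) * lda];           /* stride-lda gather          */
  }
}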
@@ -0,0 +1,77 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
uint64_t sve_size = svcntd(); | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint64_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat64_t a_vec = svld1(pg, (double *)aoffset1); | |||
svst1_f64(pg, (double *) boffset, a_vec); | |||
aoffset1 += lda; | |||
boffset += active; | |||
} | |||
aoffset += sve_size; | |||
j += svcntd(); | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
return 0; | |||
} |
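/* Scalar equivalent of the copy routine above (hypothetical helper, for
   illustration only). Same panel structure as the gather variant, but here
   the n index is the contiguous one and successive source rows are lda
   apart, so plain svld1 loads suffice. */
static void dgemm_copy_contig_ref(BLASLONG m, BLASLONG n, BLASLONG lda,
                                  const double *a, double *b, BLASLONG veclen) {
  for (BLASLONG j0 = 0; j0 < n; j0 += veclen) {
    BLASLONG active = (n - j0 < veclen) ? (n - j0) : veclen;
    for (BLASLONG i = 0; i < m; i++)            /* one contiguous load per row */
      for (BLASLONG j = 0; j < active; j++)
        *b++ = a[i * lda + j0 + j];
  }
}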
@@ -0,0 +1,874 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
/* X0 X1 X2 s0 X3 x4 x5 x6 */ | |||
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ | |||
#define origM x0 | |||
#define origN x1 | |||
#define origK x2 | |||
#define origPA x3 | |||
#define origPB x4 | |||
#define pC x5 | |||
#define LDC x6 | |||
#define temp x7 | |||
#define counterL x8 | |||
#define counterI x9 | |||
#define counterJ x10 | |||
#define pB x11 | |||
#define pCRow0 x12 | |||
#define pCRow1 x13 | |||
#define pCRow2 x14 | |||
#define lanes x15 | |||
#define pA x16 | |||
#define alpha w17 | |||
#define alpha0 s10 | |||
#define alphaZ z2.s | |||
#define A_PRE_SIZE 1536 | |||
#define B_PRE_SIZE 512 | |||
#define C_PRE_SIZE 128 | |||
// 00 origM | |||
// 01 origN | |||
// 02 origK | |||
// 03 origPA | |||
// 04 origPB | |||
// 05 pC | |||
// 06 origLDC -> LDC | |||
// 07 temp | |||
// 08 counterL | |||
// 09 counterI | |||
// 10 counterJ | |||
// 11 pB | |||
// 12 pCRow0 | |||
// 13 pCRow1 | |||
// 14 pCRow2 | |||
// 15 lanes | |||
// 16 pA | |||
// 17 alpha
// 18 must save | |||
// 19 must save | |||
// 20 must save | |||
// 21 must save | |||
// 22 must save | |||
// 23 must save | |||
// 24 must save | |||
// 25 must save | |||
// 26 must save | |||
// 27 must save | |||
// 28 must save | |||
// 29 frame | |||
// 30 link | |||
// 31 sp | |||
//v00 ALPHA -> pA0_0 | |||
//v01 pA0_1 | |||
//v02 ALPHA0 | |||
//v03 | |||
//v04 | |||
//v05 | |||
//v06 | |||
//v07 | |||
//v08 must save pB0_0 | |||
//v09 must save pB0_1 | |||
//v10 must save pB0_2 | |||
//v11 must save pB0_3 | |||
//v12 must save pB0_4 | |||
//v13 must save pB0_5 | |||
//v14 must save pB0_6 | |||
//v15 must save pB0_7 | |||
//v16 must save C0 | |||
//v17 must save C1 | |||
//v18 must save C2 | |||
//v19 must save C3 | |||
//v20 must save C4 | |||
//v21 must save C5 | |||
//v22 must save C6 | |||
//v23 must save C7 | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
.macro INITv1x8 | |||
dup z16.s, #0 | |||
dup z17.s, #0 | |||
dup z18.s, #0 | |||
dup z19.s, #0 | |||
dup z20.s, #0 | |||
dup z21.s, #0 | |||
dup z22.s, #0 | |||
dup z23.s, #0 | |||
.endm | |||
.macro KERNELv1x8_I | |||
ld1w z0.s, p1/z, [pA] | |||
ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one | |||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
ld1rw z8.s, p0/z, [pB] | |||
fmla z17.s, p1/m, z0.s, z9.s | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
fmla z18.s, p1/m, z0.s, z10.s | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
fmla z19.s, p1/m, z0.s, z11.s | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
fmla z20.s, p1/m, z0.s, z12.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
fmla z21.s, p1/m, z0.s, z13.s | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
fmla z22.s, p1/m, z0.s, z14.s | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
fmla z23.s, p1/m, z0.s, z15.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
.endm | |||
.macro KERNELv1x8_M1 | |||
ld1w z1.s, p1/z, [pA] | |||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
ld1rw z8.s, p0/z, [pB] | |||
fmla z17.s, p1/m, z0.s, z9.s | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
fmla z18.s, p1/m, z0.s, z10.s | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
fmla z19.s, p1/m, z0.s, z11.s | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
fmla z20.s, p1/m, z0.s, z12.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
fmla z21.s, p1/m, z0.s, z13.s | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
fmla z22.s, p1/m, z0.s, z14.s | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
fmla z23.s, p1/m, z0.s, z15.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
.endm | |||
.macro KERNELv1x8_M2 | |||
ld1w z0.s, p1/z, [pA] | |||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
fmla z16.s, p1/m, z1.s, z8.s | |||
ld1rw z8.s, p0/z, [pB] | |||
fmla z17.s, p1/m, z1.s, z9.s | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
fmla z18.s, p1/m, z1.s, z10.s | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
fmla z19.s, p1/m, z1.s, z11.s | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
fmla z20.s, p1/m, z1.s, z12.s | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
fmla z21.s, p1/m, z1.s, z13.s | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
fmla z22.s, p1/m, z1.s, z14.s | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
fmla z23.s, p1/m, z1.s, z15.s | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
.endm | |||
.macro KERNELv1x8_E | |||
fmla z16.s, p1/m, z1.s, z8.s | |||
fmla z17.s, p1/m, z1.s, z9.s | |||
fmla z18.s, p1/m, z1.s, z10.s | |||
fmla z19.s, p1/m, z1.s, z11.s | |||
fmla z20.s, p1/m, z1.s, z12.s | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
fmla z21.s, p1/m, z1.s, z13.s | |||
fmla z22.s, p1/m, z1.s, z14.s | |||
fmla z23.s, p1/m, z1.s, z15.s | |||
.endm | |||
.macro KERNELv1x8_SUB | |||
ld1w z0.s, p1/z, [pA] | |||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
ld1rw z12.s, p0/z, [pB, 16] | |||
ld1rw z13.s, p0/z, [pB, 20] | |||
ld1rw z14.s, p0/z, [pB, 24] | |||
ld1rw z15.s, p0/z, [pB, 28] | |||
add pB, pB, 32 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
fmla z17.s, p1/m, z0.s, z9.s | |||
fmla z18.s, p1/m, z0.s, z10.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
fmla z19.s, p1/m, z0.s, z11.s | |||
fmla z20.s, p1/m, z0.s, z12.s | |||
fmla z21.s, p1/m, z0.s, z13.s | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
fmla z22.s, p1/m, z0.s, z14.s | |||
fmla z23.s, p1/m, z0.s, z15.s | |||
.endm | |||
.macro SAVEv1x8 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
add pCRow1, pCRow0, LDC | |||
ld1w z24.s, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaZ | |||
st1w z24.s, p1, [pCRow0] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1w z25.s, p1/z, [pCRow1] | |||
fmla z25.s, p1/m, z17.s, alphaZ | |||
st1w z25.s, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1w z26.s, p1/z, [pCRow2] | |||
fmla z26.s, p1/m, z18.s, alphaZ | |||
st1w z26.s, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1w z27.s, p1/z, [pCRow1] | |||
fmla z27.s, p1/m, z19.s, alphaZ | |||
st1w z27.s, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1w z28.s, p1/z, [pCRow2] | |||
fmla z28.s, p1/m, z20.s, alphaZ | |||
st1w z28.s, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1w z29.s, p1/z, [pCRow1] | |||
fmla z29.s, p1/m, z21.s, alphaZ | |||
st1w z29.s, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1w z30.s, p1/z, [pCRow2] | |||
fmla z30.s, p1/m, z22.s, alphaZ | |||
st1w z30.s, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld1w z31.s, p1/z, [pCRow1] | |||
fmla z31.s, p1/m, z23.s, alphaZ | |||
st1w z31.s, p1, [pCRow1] | |||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x4 | |||
dup z16.s, #0 | |||
dup z17.s, #0 | |||
dup z18.s, #0 | |||
dup z19.s, #0 | |||
.endm | |||
.macro KERNELv1x4_SUB | |||
ld1w z0.s, p1/z, [pA] | |||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
ld1rw z10.s, p0/z, [pB, 8] | |||
ld1rw z11.s, p0/z, [pB, 12] | |||
add pB, pB, 16 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
fmla z17.s, p1/m, z0.s, z9.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
fmla z18.s, p1/m, z0.s, z10.s | |||
fmla z19.s, p1/m, z0.s, z11.s | |||
.endm | |||
.macro SAVEv1x4 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
add pCRow1, pCRow0, LDC | |||
ld1w z24.s, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaZ | |||
st1w z24.s, p1, [pCRow0] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
add pCRow2, pCRow1, LDC | |||
ld1w z25.s, p1/z, [pCRow1] | |||
fmla z25.s, p1/m, z17.s, alphaZ | |||
st1w z25.s, p1, [pCRow1] | |||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
add pCRow1, pCRow2, LDC | |||
ld1w z26.s, p1/z, [pCRow2] | |||
fmla z26.s, p1/m, z18.s, alphaZ | |||
st1w z26.s, p1, [pCRow2] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld1w z27.s, p1/z, [pCRow1] | |||
fmla z27.s, p1/m, z19.s, alphaZ | |||
st1w z27.s, p1, [pCRow1] | |||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x2 | |||
dup z16.s, #0 | |||
dup z17.s, #0 | |||
.endm | |||
.macro KERNELv1x2_SUB | |||
ld1w z0.s, p1/z, [pA] | |||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
ld1rw z9.s, p0/z, [pB, 4] | |||
add pB, pB, 8 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
fmla z17.s, p1/m, z0.s, z9.s | |||
.endm | |||
.macro SAVEv1x2 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
add pCRow1, pCRow0, LDC | |||
ld1w z24.s, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaZ | |||
st1w z24.s, p1, [pCRow0] | |||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
ld1w z25.s, p1/z, [pCRow1] | |||
fmla z25.s, p1/m, z17.s, alphaZ | |||
st1w z25.s, p1, [pCRow1] | |||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
.endm | |||
/******************************************************************************/ | |||
.macro INITv1x1 | |||
dup z16.s, #0 | |||
.endm | |||
.macro KERNELv1x1_SUB | |||
ld1w z0.s, p1/z, [pA] | |||
add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 | |||
ld1rw z8.s, p0/z, [pB] | |||
add pB, pB, 4 | |||
fmla z16.s, p1/m, z0.s, z8.s | |||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
.endm | |||
.macro SAVEv1x1 | |||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
ld1w z24.s, p1/z, [pCRow0] | |||
fmla z24.s, p1/m, z16.s, alphaZ | |||
st1w z24.s, p1, [pCRow0] | |||
add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 | |||
.endm | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
PROLOGUE | |||
.align 5 | |||
add sp, sp, #-(11 * 16) | |||
stp d8, d9, [sp, #(0 * 16)] | |||
stp d10, d11, [sp, #(1 * 16)] | |||
stp d12, d13, [sp, #(2 * 16)] | |||
stp d14, d15, [sp, #(3 * 16)] | |||
stp d16, d17, [sp, #(4 * 16)] | |||
stp x18, x19, [sp, #(5 * 16)] | |||
stp x20, x21, [sp, #(6 * 16)] | |||
stp x22, x23, [sp, #(7 * 16)] | |||
stp x24, x25, [sp, #(8 * 16)] | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
prfm PLDL1KEEP, [origPB] | |||
prfm PLDL1KEEP, [origPA] | |||
fmov alpha, s0 | |||
dup alphaZ, alpha | |||
lsl LDC, LDC, #2 // ldc = ldc * 4 | |||
ptrue p0.s // create true predicate | |||
mov pB, origPB | |||
// Loop over N | |||
mov counterJ, origN | |||
asr counterJ, counterJ, #3 // J = J / 8 | |||
cmp counterJ, #0 | |||
ble .Ldgemm_kernel_L4_BEGIN | |||
/******************************************************************************/ | |||
/* Repeat this as long as there are 8 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L8_BEGIN: | |||
mov pCRow0, pC | |||
add pC, pC, LDC, lsl #3 // add 8 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L8_Mv1_BEGIN: | |||
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
mov counterI, #0 | |||
whilelt p1.s, counterI, origM | |||
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension | |||
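/* Illustrative sketch (SVE C intrinsics, not part of this kernel) of the | |||
   predicated M loop below, assuming single precision: | |||
     uint64_t i = 0; | |||
     svbool_t p1 = svwhilelt_b32(i, (uint64_t)m); | |||
     do { | |||
       uint64_t lanes = svcntp_b32(svptrue_b32(), p1); // active lanes | |||
       // run the K loop with p1 governing every load/store of A and C, | |||
       // so the final partial sweep of m % VL lanes needs no scalar tail | |||
       i += svcntw(); | |||
       p1 = svwhilelt_b32(i, (uint64_t)m); | |||
     } while (svptest_any(svptrue_b32(), p1)); | |||
*/ | |||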
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_20: | |||
mov pB, origPB | |||
INITv1x8 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #2 // are there at least 2 to do? | |||
blt .Ldgemm_kernel_L8_Mv1_32 | |||
KERNELv1x8_I | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
subs counterL, counterL, #2 // subtract 2 | |||
ble .Ldgemm_kernel_L8_Mv1_22a | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_22: | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L8_Mv1_22 | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_22a: | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_E | |||
b .Ldgemm_kernel_L8_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_32: | |||
tst counterL, #1 | |||
ble .Ldgemm_kernel_L8_Mv1_40 | |||
KERNELv1x8_I | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_M2 | |||
KERNELv1x8_M1 | |||
KERNELv1x8_E | |||
b .Ldgemm_kernel_L8_Mv1_44 | |||
.Ldgemm_kernel_L8_Mv1_40: | |||
INITv1x8 | |||
.Ldgemm_kernel_L8_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L8_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L8_Mv1_46: | |||
KERNELv1x8_SUB | |||
subs counterL, counterL, #1 | |||
bne .Ldgemm_kernel_L8_Mv1_46 | |||
.Ldgemm_kernel_L8_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x8 | |||
.Ldgemm_kernel_L8_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension | |||
b.any .Ldgemm_kernel_L8_Mv1_20 | |||
.Ldgemm_kernel_L8_END: | |||
lsl temp, origK, #5 | |||
add origPB, origPB, temp // B = B + K * 8 * 4 | |||
subs counterJ, counterJ , #1 // j-- | |||
bgt .Ldgemm_kernel_L8_BEGIN | |||
/******************************************************************************/ | |||
/* Repeat the same thing if 4 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L4_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #4 | |||
ble .Ldgemm_kernel_L2_BEGIN | |||
mov pCRow0, pC | |||
add pC, pC, LDC, lsl #2 // add 4 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L4_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
.align 5 | |||
.Ldgemm_kernel_L4_Mv1_20: | |||
mov pB, origPB | |||
INITv1x4 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #0 // is there at least 8 to do? | |||
ble .Ldgemm_kernel_L4_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L4_Mv1_22: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
KERNELv1x4_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L4_Mv1_22 | |||
.Ldgemm_kernel_L4_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L4_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L4_Mv1_46: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x4_SUB | |||
subs counterL, counterL, #1 | |||
bne .Ldgemm_kernel_L4_Mv1_46 | |||
.Ldgemm_kernel_L4_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x4 | |||
.Ldgemm_kernel_L4_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
b.any .Ldgemm_kernel_L4_Mv1_20 | |||
.Ldgemm_kernel_L4_END: | |||
lsl temp, origK, #4 | |||
add origPB, origPB, temp // B = B + K * 4 * 4 | |||
/******************************************************************************/ | |||
/* Repeat the same thing if 2 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L2_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #2 | |||
ble .Ldgemm_kernel_L1_BEGIN | |||
mov pCRow0, pC | |||
add pC, pC, LDC, lsl #1 // add 2 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L2_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
.align 5 | |||
.Ldgemm_kernel_L2_Mv1_20: | |||
mov pB, origPB | |||
INITv1x2 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #0 // is there at least 8 to do? | |||
ble .Ldgemm_kernel_L2_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L2_Mv1_22: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L2_Mv1_22 | |||
.Ldgemm_kernel_L2_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L2_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L2_Mv1_46: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x2_SUB | |||
subs counterL, counterL, #1 | |||
bne .Ldgemm_kernel_L2_Mv1_46 | |||
.Ldgemm_kernel_L2_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x2 | |||
.Ldgemm_kernel_L2_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
b.any .Ldgemm_kernel_L2_Mv1_20 | |||
.Ldgemm_kernel_L2_END: | |||
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 | |||
/******************************************************************************/ | |||
/* Repeat the same thing if 1 left in N */ | |||
.align 5 | |||
.Ldgemm_kernel_L1_BEGIN: | |||
mov counterJ , origN | |||
tst counterJ , #1 | |||
ble .Ldgemm_kernel_L999 // done | |||
mov pCRow0, pC | |||
add pC, pC, LDC // add 1 x LDC | |||
mov pA, origPA // pA = start of A array | |||
.Ldgemm_kernel_L1_Mv1_BEGIN: | |||
mov counterI, #0 | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
.align 5 | |||
.Ldgemm_kernel_L1_Mv1_20: | |||
mov pB, origPB | |||
INITv1x1 // fill with zeros | |||
asr counterL , origK, #3 // L = K / 8 | |||
cmp counterL , #0 // is there at least 8 to do? | |||
ble .Ldgemm_kernel_L1_Mv1_44 | |||
.align 5 | |||
.Ldgemm_kernel_L1_Mv1_22: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L1_Mv1_22 | |||
.Ldgemm_kernel_L1_Mv1_44: | |||
ands counterL , origK, #7 | |||
ble .Ldgemm_kernel_L1_Mv1_100 | |||
.align 5 | |||
.Ldgemm_kernel_L1_Mv1_46: | |||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
KERNELv1x1_SUB | |||
subs counterL, counterL, #1 | |||
bgt .Ldgemm_kernel_L1_Mv1_46 | |||
.Ldgemm_kernel_L1_Mv1_100: | |||
prfm PLDL1KEEP, [pA] | |||
prfm PLDL1KEEP, [pA, #64] | |||
prfm PLDL1KEEP, [origPB] | |||
SAVEv1x1 | |||
.Ldgemm_kernel_L1_Mv1_END: | |||
incw counterI | |||
whilelt p1.s, counterI, origM //SVE instruction | |||
cntp lanes, p0, p1.s | |||
b.any .Ldgemm_kernel_L1_Mv1_20 | |||
.Ldgemm_kernel_L1_END: | |||
/******************************************************************************/ | |||
.Ldgemm_kernel_L999: | |||
mov x0, #0 // set return value | |||
ldp d8, d9, [sp, #(0 * 16)] | |||
ldp d10, d11, [sp, #(1 * 16)] | |||
ldp d12, d13, [sp, #(2 * 16)] | |||
ldp d14, d15, [sp, #(3 * 16)] | |||
ldp d16, d17, [sp, #(4 * 16)] | |||
ldp x18, x19, [sp, #(5 * 16)] | |||
ldp x20, x21, [sp, #(6 * 16)] | |||
ldp x22, x23, [sp, #(7 * 16)] | |||
ldp x24, x25, [sp, #(8 * 16)] | |||
ldp x26, x27, [sp, #(9 * 16)] | |||
ldr x28, [sp, #(10 * 16)] | |||
add sp, sp, #(11*16) | |||
ret | |||
EPILOGUE | |||
@@ -0,0 +1,78 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
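// Packs the matrix in blocks of 'active' (<= SVE vector length) values of the | |||
// n dimension: the inner loop walks the m dimension one element at a time, | |||
// gathering 'active' source elements with stride lda and storing them | |||
// contiguously into b, so no scalar tail loop is needed for n % VL. | |||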
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
svint32_t lda_vec = svindex_s32(0LL, lda); | |||
uint32_t sve_size = svcntw(); | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b32(j, n); | |||
uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint32_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); | |||
svst1_f32(pg, (float *) boffset, a_vec); | |||
aoffset1++; | |||
boffset += active; | |||
} | |||
aoffset += sve_size * lda; | |||
j += svcntw(); | |||
pg = svwhilelt_b32(j, n); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
return 0; | |||
} |
@@ -0,0 +1,77 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
// TODO: write in assembly with proper unrolling of inner loop | |||
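// Same packing pattern as the gather-based variant, but here the 'active' | |||
// values of the n dimension are contiguous in memory, so a plain predicated | |||
// load replaces the gather and the inner loop advances by lda along the m | |||
// dimension, storing 'active' elements contiguously into b. | |||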
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
BLASLONG j; | |||
IFLOAT *aoffset, *aoffset1, *boffset; | |||
uint32_t sve_size = svcntw(); | |||
aoffset = a; | |||
boffset = b; | |||
j = 0; | |||
svbool_t pg = svwhilelt_b32(j, n); | |||
uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
do { | |||
aoffset1 = aoffset; | |||
uint32_t i_cnt = m; | |||
while (i_cnt--) { | |||
svfloat32_t a_vec = svld1(pg, (float *) aoffset1); | |||
svst1_f32(pg, (float *) boffset, a_vec); | |||
aoffset1 += lda; | |||
boffset += active; | |||
} | |||
aoffset += sve_size; | |||
j += svcntw(); | |||
pg = svwhilelt_b32(j, n); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
return 0; | |||
} |
@@ -0,0 +1,143 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
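// Packs a full block of the symmetric matrix into b while referencing only | |||
// one triangle of a: the per-lane predicate 'cmp' (running offset posX - posY | |||
// compared against the lane index) selects one of two index formulas for | |||
// 'gat_ind', and each row step advances a lane by lda or by 1 depending on | |||
// which side of the diagonal that lane currently reads. | |||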
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, offset; | |||
#if defined(DOUBLE) | |||
uint64_t sve_size = svcntd(); | |||
svint64_t posY_vec = svdup_s64(posY); | |||
svint64_t posX_vec = svdup_s64(posX); | |||
svint64_t lda_vec = svdup_s64(lda); | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
do { | |||
offset = posX - posY; | |||
svint64_t vec_off = svdup_s64(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint64_t temp = svadd_z(pg, posX_vec, index); | |||
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
svint64_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); | |||
svst1(pg, b, data_vec); | |||
b += active; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
uint32_t sve_size = svcntw(); | |||
svint32_t posY_vec = svdup_s32(posY); | |||
svint32_t posX_vec = svdup_s32(posX); | |||
svint32_t lda_vec = svdup_s32(lda); | |||
svint32_t one_vec = svdup_s32(1); | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
do { | |||
offset = posX - posY; | |||
svint32_t vec_off = svdup_s32(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint32_t temp = svadd_z(pg, posX_vec, index); | |||
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
svint32_t gat_ind = svsel(cmp, temp1, temp2); | |||
i = m; | |||
while (i>0) { | |||
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); | |||
svst1(pg, b, data_vec); | |||
b += active; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,143 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, offset; | |||
#if defined(DOUBLE) | |||
uint64_t sve_size = svcntd(); | |||
svint64_t posY_vec = svdup_s64(posY); | |||
svint64_t posX_vec = svdup_s64(posX); | |||
svint64_t lda_vec = svdup_s64(lda); | |||
svint64_t one_vec = svdup_s64(1LL); | |||
int64_t j = 0; | |||
svbool_t pg = svwhilelt_b64(j, n); | |||
int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
svint64_t index = svindex_s64(0LL, 1LL); | |||
do { | |||
offset = posX - posY; | |||
svint64_t vec_off = svdup_s64(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint64_t temp = svadd_z(pg, posX_vec, index); | |||
svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
svint64_t gat_ind = svsel(cmp, temp2, temp1); | |||
i = m; | |||
while (i>0) { | |||
svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, one_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
svst1(pg, b, data_vec); | |||
b += active; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s64(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b64(j, n); | |||
active = svcntp_b64(svptrue_b64(), pg); | |||
} while (svptest_any(svptrue_b64(), pg)); | |||
#else | |||
uint32_t sve_size = svcntw(); | |||
svint32_t posY_vec = svdup_s32(posY); | |||
svint32_t posX_vec = svdup_s32(posX); | |||
svint32_t lda_vec = svdup_s32(lda); | |||
svint32_t one_vec = svdup_s32(1); | |||
int32_t N = n; | |||
int32_t j = 0; | |||
svbool_t pg = svwhilelt_b32(j, N); | |||
int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
svint32_t index_neg = svindex_s32(0, -1); | |||
svint32_t index = svindex_s32(0, 1); | |||
do { | |||
offset = posX - posY; | |||
svint32_t vec_off = svdup_s32(offset); | |||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg); | |||
svint32_t temp = svadd_z(pg, posX_vec, index); | |||
svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); | |||
svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); | |||
svint32_t gat_ind = svsel(cmp, temp2, temp1); | |||
i = m; | |||
while (i>0) { | |||
svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); | |||
gat_ind = svadd_m(cmp, gat_ind, one_vec); | |||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
svst1(pg, b, data_vec); | |||
b += active; | |||
offset --; | |||
vec_off = svsub_z(pg, vec_off, one_vec); | |||
cmp = svcmpgt(pg, vec_off, index_neg); | |||
i--; | |||
} | |||
posX += sve_size; | |||
posX_vec = svdup_s32(posX); | |||
j += sve_size; | |||
pg = svwhilelt_b32(j, N); | |||
active = svcntp_b32(svptrue_b32(), pg); | |||
} while (svptest_any(svptrue_b32(), pg)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,136 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
js = 0; | |||
FLOAT *ao; | |||
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posY + posX * lda; | |||
} else { | |||
ao = a + posX + posY * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X > posY) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#else | |||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
ao ++; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X < posY) { | |||
ao += lda; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = *(ao+k*lda+j); | |||
} | |||
b[temp++] = ONE; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k <= j; k++) { | |||
b[temp++] = *(ao+k*lda+j); | |||
} | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#endif | |||
ao += n_active; | |||
b += n_active*n_active; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,136 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
FLOAT *ao; | |||
js = 0; | |||
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posY + posX * lda; | |||
} else { | |||
ao = a + posX + posY * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X > posY) { | |||
ao ++; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X < posY) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1(pn, ao); | |||
#else | |||
svfloat32_t aj_vec = svld1(pn, ao); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
ao += lda; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
b[temp++] = ONE; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = *(ao+j*lda+k); | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
for (int k = j; k < n_active; k++) { | |||
b[temp++] = *(ao+j*lda+k); | |||
} | |||
} | |||
#endif | |||
ao += n_active * lda; | |||
b += n_active*n_active; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,136 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
js = 0; | |||
FLOAT *ao; | |||
#ifdef DOUBLE | |||
svint64_t index = svindex_s64(0LL, lda); | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svint32_t index = svindex_s32(0, lda); | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posX + posY * lda; | |||
} else { | |||
ao = a + posY + posX * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X < posY) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#else | |||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
ao ++; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X > posY) { | |||
ao += lda; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
b[temp++] = ONE; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = *(ao+k*lda+j); | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
for (int k = j; k < n_active; k++) { | |||
b[temp++] = *(ao+k*lda+j); | |||
} | |||
} | |||
#endif | |||
ao += n_active; | |||
b += n_active*n_active; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,134 @@ | |||
/*********************************************************************/ | |||
/* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
/* All rights reserved. */ | |||
/* */ | |||
/* Redistribution and use in source and binary forms, with or */ | |||
/* without modification, are permitted provided that the following */ | |||
/* conditions are met: */ | |||
/* */ | |||
/* 1. Redistributions of source code must retain the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer. */ | |||
/* */ | |||
/* 2. Redistributions in binary form must reproduce the above */ | |||
/* copyright notice, this list of conditions and the following */ | |||
/* disclaimer in the documentation and/or other materials */ | |||
/* provided with the distribution. */ | |||
/* */ | |||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
/* POSSIBILITY OF SUCH DAMAGE. */ | |||
/* */ | |||
/* The views and conclusions contained in the software and */ | |||
/* documentation are those of the authors and should not be */ | |||
/* interpreted as representing official policies, either expressed */ | |||
/* or implied, of The University of Texas at Austin. */ | |||
/*********************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#ifdef __ARM_FEATURE_SVE | |||
#include <arm_sve.h> | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
BLASLONG i, js; | |||
BLASLONG X; | |||
FLOAT *ao; | |||
js = 0; | |||
#ifdef DOUBLE | |||
svbool_t pn = svwhilelt_b64(js, n); | |||
int n_active = svcntp_b64(svptrue_b64(), pn); | |||
#else | |||
svbool_t pn = svwhilelt_b32(js, n); | |||
int n_active = svcntp_b32(svptrue_b32(), pn); | |||
#endif | |||
do | |||
{ | |||
X = posX; | |||
if (posX <= posY) { | |||
ao = a + posX + posY * lda; | |||
} else { | |||
ao = a + posY + posX * lda; | |||
} | |||
i = 0; | |||
do | |||
{ | |||
if (X < posY) { | |||
ao ++; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else | |||
if (X > posY) { | |||
#ifdef DOUBLE | |||
svfloat64_t aj_vec = svld1(pn, ao); | |||
#else | |||
svfloat32_t aj_vec = svld1(pn, ao); | |||
#endif | |||
svst1(pn, b, aj_vec); | |||
ao += lda; | |||
b += n_active; | |||
X ++; | |||
i ++; | |||
} else { | |||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
#ifdef UNIT | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k < j; k++) { | |||
b[temp++] = *(ao+j*lda+k); | |||
} | |||
b[temp++] = ONE; | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#else | |||
int temp = 0; | |||
for (int j = 0; j < n_active; j++) { | |||
for (int k = 0 ; k <= j; k++) { | |||
b[temp++] = *(ao+j*lda+k); | |||
} | |||
for (int k = j+1; k < n_active; k++) { | |||
b[temp++] = ZERO; | |||
} | |||
} | |||
#endif | |||
ao += n_active * lda; | |||
b += n_active*n_active; | |||
X += n_active; | |||
i += n_active; | |||
} | |||
} while (i < m); | |||
posY += n_active; | |||
js += n_active; | |||
#ifdef DOUBLE | |||
pn = svwhilelt_b64(js, n); | |||
n_active = svcntp_b64(svptrue_b64(), pn); | |||
} while (svptest_any(svptrue_b64(), pn)); | |||
#else | |||
pn = svwhilelt_b32(js, n); | |||
n_active = svcntp_b32(svptrue_b32(), pn); | |||
} while (svptest_any(svptrue_b32(), pn)); | |||
#endif | |||
return 0; | |||
} |
@@ -0,0 +1,736 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
/******************************************************************************* | |||
The complex GEMM kernels in OpenBLAS use static configuration of conjugation | |||
modes via specific macros: | |||
MACRO_NAME | conjugation on matrix A | conjugation on matrix B | | |||
---------- | ----------------------- | ----------------------- | | |||
NN/NT/TN/TT | No | No | | |||
NR/NC/TR/TC | No | Yes | | |||
RN/RT/CN/CT | Yes | No | | |||
RR/RC/CR/CC | Yes | Yes | | |||
"conjugation on matrix A" means the complex conjugates of elements from | |||
matrix A are used for matmul (rather than the original elements). "conjugation | |||
on matrix B" means the complex conjugate of each element from matrix B is taken | |||
for matrix multiplication, respectively. | |||
Complex numbers in arrays or matrices are usually packed together as an | |||
array of struct (without padding): | |||
struct complex_number { | |||
FLOAT real_part; | |||
FLOAT imag_part; | |||
}; | |||
For a double complex array ARR[] which is usually DEFINED AS AN ARRAY OF | |||
DOUBLE, the real part of its Kth complex number can be accessed as | |||
ARR[K * 2], the imaginary part of the Kth complex number is ARR[2 * K + 1]. | |||
This file uses 2 ways to vectorize matrix multiplication of complex numbers: | |||
(1) Expanded-form | |||
During accumulation along direction K: | |||
Σk(a[0][k].real b[k][n].real) | |||
accumulate Σk(a[0][k].imag b[k][n].real) | |||
-------------------> . | |||
| * b[k][n].real . | |||
| (broadcasted) . | |||
a[0][k].real Σk(a[v-1][k].real b[k][n].real) | |||
a[0][k].imag Σk(a[v-1][k].imag b[k][n].real) | |||
. VECTOR I | |||
(vec_a) . | |||
. | |||
a[v-1][k].real Σk(a[0][k].real b[k][n].imag) | |||
a[v-1][k].imag Σk(a[0][k].imag b[k][n].imag) | |||
| . | |||
| accumulate . | |||
-------------------> . | |||
* b[k][n].imag Σk(a[v-1][k].real b[k][n].imag) | |||
(broadcasted) Σk(a[v-1][k].imag b[k][n].imag) | |||
VECTOR II | |||
After accumulation, prior to storage: | |||
-1 -Σk(a[0][k].imag b[k][n].imag) | |||
1 Σk(a[0][k].real b[k][n].imag) | |||
. . | |||
VECTOR II permute and multiply . to get . | |||
. . | |||
-1 -Σk(a[v-1][k].imag b[k][n].imag) | |||
1 Σk(a[v-1][k].real b[k][n].imag) | |||
then add with VECTOR I to get the result vector of elements of C. | |||
2 vector registers are needed for every v elements of C, with | |||
v == sizeof(vector) / sizeof(complex) | |||
(2) Contracted-form | |||
During accumulation along direction K: | |||
(the K coordinate is not shown, since the operation is identical for each k) | |||
(load vector in mem) (load vector in mem) | |||
a[0].r a[0].i ... a[v-1].r a[v-1].i a[v].r a[v].i ... a[2v-1].r a[2v-1].i | |||
| | | |||
| unzip operation (or VLD2 in arm neon) | | |||
----------------------------------------------------- | |||
| | |||
| | |||
-------------------------------------------------- | |||
| | | |||
| | | |||
v v | |||
a[0].real ... a[2v-1].real a[0].imag ... a[2v-1].imag | |||
| | | | | |||
| | * b[i].imag(broadcast) | | | |||
* b[i].real | -----------------------------|---- | * b[i].real | |||
(broadcast) | | | | (broadcast) | |||
| ------------------------------ | | | |||
+ | - | * b[i].imag(broadcast) + | + | | |||
v v v v | |||
(accumulate) (accumulate) | |||
c[0].real ... c[2v-1].real c[0].imag ... c[2v-1].imag | |||
VECTOR_REAL VECTOR_IMAG | |||
After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved) | |||
then stored to matrix C directly. | |||
For 2v elements of C, only 2 vector registers are needed, while | |||
4 registers are required for expanded-form. | |||
(v == sizeof(vector) / sizeof(complex)) | |||
For AArch64 zgemm, the 4x4 kernel would need 32 128-bit NEON registers to | |||
hold the elements of C in expanded-form calculation, which would force | |||
register spilling, so the contracted form is selected for the 4x4 kernel. | |||
All other combinations of unroll parameters (2x4, 4x2, 2x2, and so on) use | |||
the expanded form, bringing more NEON registers into use to hide the | |||
latency of the multiply-add instructions. | |||
******************************************************************************/ | |||
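/* The following helper is an illustrative sketch only (it is not called by | |||
   the kernels below): the scalar arithmetic that the expanded-form scheme | |||
   performs for a single element of C, assuming the NN case (no conjugation) | |||
   and alpha == 1. The pairs {rr, ir} and {ri, ii} correspond to VECTOR I | |||
   and VECTOR II in the description above. */ | |||
static inline void expanded_form_scalar_ref(const double *a, const double *b, | |||
                                            double *c, BLASLONG K) { | |||
  double rr = 0, ir = 0, ri = 0, ii = 0; /* four partial sums */ | |||
  for (BLASLONG k = 0; k < K; k++) { | |||
    rr += a[2 * k]     * b[2 * k];     /* a.real * b.real (VECTOR I, lane 0) */ | |||
    ir += a[2 * k + 1] * b[2 * k];     /* a.imag * b.real (VECTOR I, lane 1) */ | |||
    ri += a[2 * k]     * b[2 * k + 1]; /* a.real * b.imag (VECTOR II, lane 0) */ | |||
    ii += a[2 * k + 1] * b[2 * k + 1]; /* a.imag * b.imag (VECTOR II, lane 1) */ | |||
  } | |||
  c[0] += rr - ii; /* real part */ | |||
  c[1] += ir + ri; /* imaginary part */ | |||
} | |||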
static inline float64x2_t set_f64x2(double lo, double hi) { | |||
float64x2_t ret = vdupq_n_f64(0); | |||
ret = vsetq_lane_f64(lo, ret, 0); | |||
ret = vsetq_lane_f64(hi, ret, 1); | |||
return ret; | |||
} | |||
static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) { | |||
float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }}; | |||
return ret; | |||
} | |||
/***************************************************************** | |||
* operation: *c += alpha * c_value //complex multiplication | |||
* expanded_alpha: {{ alpha_r, alpha_i }, { -alpha_i, alpha_r }} | |||
* expanded_c: {{ arbr, aibr }, { arbi, aibi }} | |||
****************************************************************/ | |||
static inline void store_1c(double *c, float64x2x2_t expanded_c, | |||
float64x2x2_t expanded_alpha) { | |||
float64x2_t ld = vld1q_f64(c); | |||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); | |||
double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); | |||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); | |||
double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); | |||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); | |||
double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); | |||
#else | |||
double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); | |||
double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); | |||
#endif | |||
ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real); | |||
vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag)); | |||
} | |||
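/* Net effect of store_1c for the NN case (alpha is complex): | |||
     c[0] += alpha_r * real - alpha_i * imag; | |||
     c[1] += alpha_i * real + alpha_r * imag; | |||
   where real/imag are recombined from expanded_c as described above. */ | |||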
static inline void pref_c_4(const double *c) { | |||
__asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):); | |||
} | |||
static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) { | |||
float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]), | |||
vaddq_f64(ec1.val[1], ec2.val[1]) }}; | |||
return ret; | |||
} | |||
static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) { | |||
float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }}; | |||
return ret; | |||
} | |||
static inline float64x2x2_t init() { | |||
float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }}; | |||
return ret; | |||
} | |||
static inline void kernel_1x1(const double *sa, const double *sb, double *C, | |||
BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init(); | |||
for (; K > 3; K -= 4) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), | |||
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), | |||
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; | |||
c1 = update_ec(c1, a1, b1); | |||
c2 = update_ec(c2, a2, b2); | |||
c3 = update_ec(c3, a3, b3); | |||
c4 = update_ec(c4, a4, b4); | |||
} | |||
c1 = add_ec(c1, c2); | |||
c3 = add_ec(c3, c4); | |||
c1 = add_ec(c1, c3); | |||
for (; K; K--) { | |||
c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2; | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
} | |||
static inline void kernel_2x1(const double *sa, const double *sb, double *C, | |||
BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init(); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), | |||
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
c1 = update_ec(c1, a1, b1); | |||
c2 = update_ec(c2, a2, b1); | |||
c3 = update_ec(c3, a3, b2); | |||
c4 = update_ec(c4, a4, b2); | |||
} | |||
c1 = add_ec(c1, c3); | |||
c2 = add_ec(c2, c4); | |||
if (K) { | |||
float64x2_t b1 = vld1q_f64(sb); | |||
c1 = update_ec(c1, vld1q_f64(sa), b1); | |||
c2 = update_ec(c2, vld1q_f64(sa + 2), b1); | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
store_1c(C + 2, c2, expanded_alpha); | |||
} | |||
static inline void kernel_1x2(const double *sa, const double *sb, double *C, | |||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init(); | |||
for (; K > 1; K -= 2) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), | |||
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; | |||
c1 = update_ec(c1, a1, b1); | |||
c2 = update_ec(c2, a1, b2); | |||
c3 = update_ec(c3, a2, b3); | |||
c4 = update_ec(c4, a2, b4); | |||
} | |||
c1 = add_ec(c1, c3); | |||
c2 = add_ec(c2, c4); | |||
if (K) { | |||
float64x2_t a1 = vld1q_f64(sa); | |||
c1 = update_ec(c1, a1, vld1q_f64(sb)); | |||
c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
store_1c(C + LDC * 2, c2, expanded_alpha); | |||
} | |||
static inline void kernel_2x2(const double *sa, const double *sb, double *C, | |||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init(); | |||
for (; K; K--) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
c1 = update_ec(c1, a1, b1); | |||
c2 = update_ec(c2, a2, b1); | |||
c3 = update_ec(c3, a1, b2); | |||
c4 = update_ec(c4, a2, b2); | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c3, expanded_alpha); | |||
store_1c(C + 2, c4, expanded_alpha); | |||
} | |||
static inline void kernel_4x1(const double *sa, const double *sb, double *C, | |||
BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init(); | |||
pref_c_4(C); | |||
for (; K; K--) { | |||
float64x2_t b1 = vld1q_f64(sb); sb += 2; | |||
c1 = update_ec(c1, vld1q_f64(sa), b1); | |||
c2 = update_ec(c2, vld1q_f64(sa + 2), b1); | |||
c3 = update_ec(c3, vld1q_f64(sa + 4), b1); | |||
c4 = update_ec(c4, vld1q_f64(sa + 6), b1); | |||
sa += 8; | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
store_1c(C + 2, c2, expanded_alpha); | |||
store_1c(C + 4, c3, expanded_alpha); | |||
store_1c(C + 6, c4, expanded_alpha); | |||
} | |||
static inline void kernel_4x2(const double *sa, const double *sb, double *C, | |||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; | |||
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); | |||
pref_c_4(C); | |||
pref_c_4(C + LDC * 2); | |||
for (; K; K--) { | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), | |||
a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; | |||
c1 = update_ec(c1, a1, b1); | |||
c2 = update_ec(c2, a2, b1); | |||
c3 = update_ec(c3, a3, b1); | |||
c4 = update_ec(c4, a4, b1); | |||
c5 = update_ec(c5, a1, b2); | |||
c6 = update_ec(c6, a2, b2); | |||
c7 = update_ec(c7, a3, b2); | |||
c8 = update_ec(c8, a4, b2); | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
store_1c(C + 2, c2, expanded_alpha); | |||
store_1c(C + 4, c3, expanded_alpha); | |||
store_1c(C + 6, c4, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c5, expanded_alpha); | |||
store_1c(C + 2, c6, expanded_alpha); | |||
store_1c(C + 4, c7, expanded_alpha); | |||
store_1c(C + 6, c8, expanded_alpha); | |||
} | |||
static inline void kernel_1x4(const double *sa, const double *sb, double *C, | |||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4; | |||
c1 = c2 = c3 = c4 = init(); | |||
for (; K; K--) { | |||
float64x2_t a1 = vld1q_f64(sa); sa += 2; | |||
c1 = update_ec(c1, a1, vld1q_f64(sb)); | |||
c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); | |||
c3 = update_ec(c3, a1, vld1q_f64(sb + 4)); | |||
c4 = update_ec(c4, a1, vld1q_f64(sb + 6)); | |||
sb += 8; | |||
} | |||
store_1c(C, c1, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c2, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c3, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c4, expanded_alpha); | |||
} | |||
static inline void kernel_2x4(const double *sa, const double *sb, double *C, | |||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); | |||
float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; | |||
c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); | |||
for (; K; K--) { | |||
float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; | |||
float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), | |||
b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; | |||
c1 = update_ec(c1, a1, b1); | |||
c2 = update_ec(c2, a2, b1); | |||
c3 = update_ec(c3, a1, b2); | |||
c4 = update_ec(c4, a2, b2); | |||
c5 = update_ec(c5, a1, b3); | |||
c6 = update_ec(c6, a2, b3); | |||
c7 = update_ec(c7, a1, b4); | |||
c8 = update_ec(c8, a2, b4); | |||
} | |||
store_1c(C, c1, expanded_alpha); | |||
store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c3, expanded_alpha); | |||
store_1c(C + 2, c4, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c5, expanded_alpha); | |||
store_1c(C + 2, c6, expanded_alpha); C += LDC * 2; | |||
store_1c(C, c7, expanded_alpha); | |||
store_1c(C + 2, c8, expanded_alpha); | |||
} | |||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
#define FMLA_RI "fmla " | |||
#define FMLA_IR "fmla " | |||
#define FMLA_II "fmls " | |||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
#define FMLA_RI "fmls " | |||
#define FMLA_IR "fmla " | |||
#define FMLA_II "fmla " | |||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
#define FMLA_RI "fmla " | |||
#define FMLA_IR "fmls " | |||
#define FMLA_II "fmla " | |||
#else | |||
#define FMLA_RI "fmls " | |||
#define FMLA_IR "fmls " | |||
#define FMLA_II "fmls " | |||
#endif | |||
#define FMLA_RR "fmla " | |||
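/*
 * Reader's note on the macros above: each accumulator pair (cNr, cNi) in the
 * assembly kernel receives four fused multiplies per complex product.
 * FMLA_RR adds a_r*b_r into cNr, FMLA_II adds or subtracts a_i*b_i into cNr,
 * FMLA_RI and FMLA_IR add or subtract a_r*b_i and a_i*b_r into cNi; whether
 * each one expands to fmla or fmls encodes the sign required by the active
 * conjugation variant, mirroring the #if branches in store_1c above.
 */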
static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i, | |||
float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) { | |||
float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4); | |||
up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar); | |||
up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai); | |||
lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar); | |||
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai); | |||
up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai); | |||
up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar); | |||
lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai); | |||
lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar); | |||
vst2q_f64(C, up); | |||
vst2q_f64(C + 4, lo); | |||
} | |||
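/*
 * Illustrative scalar equivalent of store_4c (sketch only, editorial name):
 * the vld2q/vst2q pair deinterleaves four consecutive complex elements of C
 * into real and imaginary lanes, and each element is updated as C += alpha*v
 * where v = (v_r, v_i) already carries the conjugation signs.
 */
static inline void store_4c_scalar_reference(double *C, const double v_r[4],
                                             const double v_i[4],
                                             double alphar, double alphai) {
  for (int j = 0; j < 4; j++) {
    C[2 * j + 0] += alphar * v_r[j] - alphai * v_i[j];
    C[2 * j + 1] += alphai * v_r[j] + alphar * v_i[j];
  }
}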
static inline void kernel_4x4(const double *sa, const double *sb, double *C, | |||
BLASLONG LDC, BLASLONG K, double alphar, double alphai) { | |||
float64x2_t c1r, c1i, c2r, c2i; | |||
float64x2_t c3r, c3i, c4r, c4i; | |||
float64x2_t c5r, c5i, c6r, c6i; | |||
float64x2_t c7r, c7i, c8r, c8i; | |||
const double *pref_ = C; | |||
pref_c_4(pref_); pref_ += LDC * 2; | |||
pref_c_4(pref_); pref_ += LDC * 2; | |||
pref_c_4(pref_); pref_ += LDC * 2; | |||
pref_c_4(pref_); | |||
__asm__ __volatile__( | |||
"cmp %[K],#0\n\t" | |||
"movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t" | |||
"movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t" | |||
"movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t" | |||
"movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" | |||
"beq 4f; cmp %[K],#2\n\t" | |||
"ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t" | |||
"ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t" | |||
"ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t" | |||
"beq 2f; blt 3f\n\t" | |||
"1:\n\t" | |||
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" | |||
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" | |||
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" | |||
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" | |||
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" | |||
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" | |||
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" | |||
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" | |||
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" | |||
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" | |||
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" | |||
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" | |||
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" | |||
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" | |||
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" | |||
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" | |||
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" | |||
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" | |||
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" | |||
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" | |||
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" | |||
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" | |||
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" | |||
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" | |||
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" | |||
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" | |||
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" | |||
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" | |||
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" | |||
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" | |||
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" | |||
"fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t" | |||
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t" | |||
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" | |||
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" | |||
"fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t" | |||
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t" | |||
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" | |||
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" | |||
"fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t" | |||
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t" | |||
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" | |||
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" | |||
"fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t" | |||
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t" | |||
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" | |||
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" | |||
"fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t" | |||
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t" | |||
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" | |||
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" | |||
"fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t" | |||
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t" | |||
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" | |||
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" | |||
"fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t" | |||
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t" | |||
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" | |||
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" | |||
"fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t" | |||
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t" | |||
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" | |||
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t" | |||
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" | |||
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t" | |||
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" | |||
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t" | |||
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" | |||
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t" | |||
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" | |||
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t" | |||
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" | |||
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t" | |||
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" | |||
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" | |||
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" | |||
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t" | |||
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" | |||
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t" | |||
"2:\n\t" | |||
"fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" | |||
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" | |||
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" | |||
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" | |||
"fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" | |||
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" | |||
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" | |||
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" | |||
"fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" | |||
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" | |||
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" | |||
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" | |||
"fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" | |||
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" | |||
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" | |||
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" | |||
"fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" | |||
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" | |||
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" | |||
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" | |||
"fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" | |||
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" | |||
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" | |||
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" | |||
"fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" | |||
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" | |||
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" | |||
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" | |||
"fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" | |||
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" | |||
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" | |||
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" | |||
"fmov v15.d[1],x0\n\t" | |||
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" | |||
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" | |||
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" | |||
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" | |||
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" | |||
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" | |||
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" | |||
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" | |||
FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" | |||
FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t" | |||
FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" | |||
FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" | |||
FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t" | |||
FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" | |||
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" | |||
FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t" | |||
FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" | |||
FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" | |||
FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t" | |||
FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" | |||
FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" | |||
FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t" | |||
FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" | |||
FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t" | |||
FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" | |||
FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t" | |||
FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" | |||
FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t" | |||
FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" | |||
FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t" | |||
FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" | |||
FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t" | |||
FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" | |||
FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t" | |||
FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" | |||
FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" | |||
FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" | |||
FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t" | |||
FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" | |||
FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t" | |||
"3:\n\t" | |||
"fmov v7.d[1],x0\n\t" | |||
FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t" | |||
FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" | |||
FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" | |||
FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t" | |||
FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" | |||
FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" | |||
FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t" | |||
FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" | |||
FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" | |||
FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t" | |||
FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" | |||
FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" | |||
FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t" | |||
FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" | |||
FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" | |||
FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t" | |||
FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" | |||
FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" | |||
FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t" | |||
FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" | |||
FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" | |||
FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t" | |||
FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" | |||
FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" | |||
FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" | |||
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" | |||
FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" | |||
FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" | |||
FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" | |||
FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" | |||
FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" | |||
FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t" | |||
"4:\n\t" | |||
:[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), | |||
[c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), | |||
[c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), | |||
[c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), | |||
[K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb) | |||
::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | |||
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); | |||
store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2; | |||
store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2; | |||
store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2; | |||
store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai); | |||
} | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, | |||
FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { | |||
BLASLONG n_left = N; | |||
for (; n_left >= 4; n_left -= 4) { | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
BLASLONG m_left = M; | |||
for (; m_left >= 4; m_left -= 4) { | |||
kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai); | |||
a_ += 8 * K; | |||
c_ += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai); | |||
a_ += 4 * K; | |||
c_ += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai); | |||
} | |||
sb += 8 * K; | |||
C += 8 * LDC; | |||
} | |||
if (n_left >= 2) { | |||
n_left -= 2; | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
BLASLONG m_left = M; | |||
for (; m_left >= 4; m_left -= 4) { | |||
kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai); | |||
a_ += 8 * K; | |||
c_ += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai); | |||
a_ += 4 * K; | |||
c_ += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai); | |||
} | |||
sb += 4 * K; | |||
C += 4 * LDC; | |||
} | |||
if (n_left) { | |||
const FLOAT *a_ = sa; | |||
FLOAT *c_ = C; | |||
BLASLONG m_left = M; | |||
for (; m_left >= 4; m_left -= 4) { | |||
kernel_4x1(a_, sb, c_, K, alphar, alphai); | |||
a_ += 8 * K; | |||
c_ += 8; | |||
} | |||
if (m_left >= 2) { | |||
m_left -= 2; | |||
kernel_2x1(a_, sb, c_, K, alphar, alphai); | |||
a_ += 4 * K; | |||
c_ += 4; | |||
} | |||
if (m_left) { | |||
kernel_1x1(a_, sb, c_, K, alphar, alphai); | |||
} | |||
} | |||
return 0; | |||
} | |||
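/*
 * Driver layout note (illustrative): CNAME walks the n dimension in column
 * strips of 4, then 2, then 1, and each strip walks the m dimension in row
 * strips of 4/2/1, dispatching to the matching kernel_MxN above. The pointer
 * strides follow from the packed-panel format: an m-row strip of A occupies
 * 2*m*K doubles (hence a_ += 8*K for m = 4), an n-column strip of B occupies
 * 2*n*K doubles (sb += 8*K for n = 4), and C advances by n*LDC complex
 * elements per column strip.
 */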
@@ -0,0 +1,160 @@ | |||
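# Naming convention used throughout this file: the leading letter selects the
# precision (S = single real, D = double real, C = single complex, Z = double
# complex); entries pointing at ../generic or ../mips reuse the portable
# plain-C reference kernels.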
SGEMM_BETA = ../generic/gemm_beta.c | |||
DGEMM_BETA = ../generic/gemm_beta.c | |||
CGEMM_BETA = ../generic/zgemm_beta.c | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
#Pure C for other kernels | |||
SAMAXKERNEL = ../mips/amax.c | |||
DAMAXKERNEL = ../mips/amax.c | |||
CAMAXKERNEL = ../mips/zamax.c | |||
ZAMAXKERNEL = ../mips/zamax.c | |||
SAMINKERNEL = ../mips/amin.c | |||
DAMINKERNEL = ../mips/amin.c | |||
CAMINKERNEL = ../mips/zamin.c | |||
ZAMINKERNEL = ../mips/zamin.c | |||
SMAXKERNEL = ../mips/max.c | |||
DMAXKERNEL = ../mips/max.c | |||
SMINKERNEL = ../mips/min.c | |||
DMINKERNEL = ../mips/min.c | |||
ISAMAXKERNEL = ../mips/iamax.c | |||
IDAMAXKERNEL = ../mips/iamax.c | |||
ICAMAXKERNEL = ../mips/izamax.c | |||
IZAMAXKERNEL = ../mips/izamax.c | |||
ISAMINKERNEL = ../mips/iamin.c | |||
IDAMINKERNEL = ../mips/iamin.c | |||
ICAMINKERNEL = ../mips/izamin.c | |||
IZAMINKERNEL = ../mips/izamin.c | |||
ISMAXKERNEL = ../mips/imax.c | |||
IDMAXKERNEL = ../mips/imax.c | |||
ISMINKERNEL = ../mips/imin.c | |||
IDMINKERNEL = ../mips/imin.c | |||
SASUMKERNEL = ../mips/asum.c | |||
DASUMKERNEL = ../mips/asum.c | |||
CASUMKERNEL = ../mips/zasum.c | |||
ZASUMKERNEL = ../mips/zasum.c | |||
SSUMKERNEL = ../mips/sum.c | |||
DSUMKERNEL = ../mips/sum.c | |||
CSUMKERNEL = ../mips/zsum.c | |||
ZSUMKERNEL = ../mips/zsum.c | |||
SAXPYKERNEL = ../mips/axpy.c | |||
DAXPYKERNEL = ../mips/axpy.c | |||
CAXPYKERNEL = ../mips/zaxpy.c | |||
ZAXPYKERNEL = ../mips/zaxpy.c | |||
SCOPYKERNEL = ../mips/copy.c | |||
DCOPYKERNEL = ../mips/copy.c | |||
CCOPYKERNEL = ../mips/zcopy.c | |||
ZCOPYKERNEL = ../mips/zcopy.c | |||
SDOTKERNEL = ../mips/dot.c | |||
DDOTKERNEL = ../mips/dot.c | |||
CDOTKERNEL = ../mips/zdot.c | |||
ZDOTKERNEL = ../mips/zdot.c | |||
SNRM2KERNEL = ../mips/nrm2.c | |||
DNRM2KERNEL = ../mips/nrm2.c | |||
CNRM2KERNEL = ../mips/znrm2.c | |||
ZNRM2KERNEL = ../mips/znrm2.c | |||
SROTKERNEL = ../mips/rot.c | |||
DROTKERNEL = ../mips/rot.c | |||
CROTKERNEL = ../mips/zrot.c | |||
ZROTKERNEL = ../mips/zrot.c | |||
SSCALKERNEL = ../mips/scal.c | |||
DSCALKERNEL = ../mips/scal.c | |||
CSCALKERNEL = ../mips/zscal.c | |||
ZSCALKERNEL = ../mips/zscal.c | |||
SSWAPKERNEL = ../mips/swap.c | |||
DSWAPKERNEL = ../mips/swap.c | |||
CSWAPKERNEL = ../mips/zswap.c | |||
ZSWAPKERNEL = ../mips/zswap.c | |||
SGEMVNKERNEL = ../mips/gemv_n.c | |||
DGEMVNKERNEL = ../mips/gemv_n.c | |||
CGEMVNKERNEL = ../mips/zgemv_n.c | |||
ZGEMVNKERNEL = ../mips/zgemv_n.c | |||
SGEMVTKERNEL = ../mips/gemv_t.c | |||
DGEMVTKERNEL = ../mips/gemv_t.c | |||
CGEMVTKERNEL = ../mips/zgemv_t.c | |||
ZGEMVTKERNEL = ../mips/zgemv_t.c | |||
SSYMV_U_KERNEL = ../generic/symv_k.c | |||
SSYMV_L_KERNEL = ../generic/symv_k.c | |||
DSYMV_U_KERNEL = ../generic/symv_k.c | |||
DSYMV_L_KERNEL = ../generic/symv_k.c | |||
QSYMV_U_KERNEL = ../generic/symv_k.c | |||
QSYMV_L_KERNEL = ../generic/symv_k.c | |||
CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c |
@@ -1,7 +1,6 @@ | |||
ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
ifeq ($(HAVE_GAS), 1) | |||
include $(KERNELDIR)/KERNEL.POWER8 | |||
else | |||
#SGEMM_BETA = ../generic/gemm_beta.c | |||
#DGEMM_BETA = ../generic/gemm_beta.c | |||
#CGEMM_BETA = ../generic/zgemm_beta.c | |||
@@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c | |||
SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c | |||
SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c | |||
SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c | |||
SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c | |||
SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c | |||
SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c | |||
SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c | |||
SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c | |||
DGEMMKERNEL = dgemm_kernel_power10.c | |||
DGEMMINCOPY = | |||
DGEMMITCOPY = | |||
@@ -43,7 +52,18 @@ DGEMMITCOPYOBJ = | |||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c | |||
DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c | |||
DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c | |||
DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c | |||
DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c | |||
DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c | |||
DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c | |||
DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c | |||
DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c | |||
CGEMMKERNEL = cgemm_kernel_power10.S | |||
#CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
@@ -218,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
endif |
@@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
#endif | |||
const float *mvecp = mvec; | |||
/* We have to load reverse mask for big endian. */ | |||
/* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
__vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; | |||
#else | |||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
#endif | |||
long ytmp; | |||
__asm__ | |||
@@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
"xvmaddasp 38, 58, 33 \n\t" | |||
"xvmaddasp 39, 59, 33 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"stxv 48, 0(%4) \n\t" | |||
"stxv 49, 16(%4) \n\t" | |||
"stxv 50, 32(%4) \n\t" | |||
"stxv 51, 48(%4) \n\t" | |||
"stxv 34, 64(%4) \n\t" | |||
"stxv 35, 80(%4) \n\t" | |||
"stxv 38, 96(%4) \n\t" | |||
"stxv 39, 112(%4) \n\t" | |||
#else | |||
"stxv 49, 0(%4) \n\t" | |||
"stxv 48, 16(%4) \n\t" | |||
"stxv 51, 32(%4) \n\t" | |||
@@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
"stxv 34, 80(%4) \n\t" | |||
"stxv 39, 96(%4) \n\t" | |||
"stxv 38, 112(%4) \n\t" | |||
#endif | |||
"addi %4, %4, 128 \n\t" | |||
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part | |||
@@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
"xvmaddasp 38, 58, 33 \n\t" | |||
"xvmaddasp 39, 59, 33 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"stxv 48, 0(%4) \n\t" | |||
"stxv 49, 16(%4) \n\t" | |||
"stxv 50, 32(%4) \n\t" | |||
"stxv 51, 48(%4) \n\t" | |||
"stxv 34, 64(%4) \n\t" | |||
"stxv 35, 80(%4) \n\t" | |||
"stxv 38, 96(%4) \n\t" | |||
"stxv 39, 112(%4) \n\t" | |||
#else | |||
"stxv 49, 0(%4) \n\t" | |||
"stxv 48, 16(%4) \n\t" | |||
"stxv 51, 32(%4) \n\t" | |||
@@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
"stxv 34, 80(%4) \n\t" | |||
"stxv 39, 96(%4) \n\t" | |||
"stxv 38, 112(%4) \n\t" | |||
#endif | |||
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" | |||
: | |||
@@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
".align 5 \n" | |||
"one%=: \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"stxv 32, 0(%3) \n\t" | |||
"stxv 33, 16(%3) \n\t" | |||
"stxv 34, 32(%3) \n\t" | |||
"stxv 35, 48(%3) \n\t" | |||
"stxv 36, 64(%3) \n\t" | |||
"stxv 37, 80(%3) \n\t" | |||
"stxv 38, 96(%3) \n\t" | |||
"stxv 39, 112(%3) \n\t" | |||
#else | |||
"stxv 33, 0(%3) \n\t" | |||
"stxv 32, 16(%3) \n\t" | |||
"stxv 35, 32(%3) \n\t" | |||
@@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
"stxv 36, 80(%3) \n\t" | |||
"stxv 39, 96(%3) \n\t" | |||
"stxv 38, 112(%3) \n\t" | |||
#endif | |||
"lxvp 32, 0(%2) \n\t" | |||
"lxvp 34, 32(%2) \n\t" | |||
"lxvp 36, 64(%2) \n\t" | |||
"lxvp 38, 96(%2) \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"stxv 40, 128(%3) \n\t" | |||
"stxv 41, 144(%3) \n\t" | |||
"stxv 42, 160(%3) \n\t" | |||
"stxv 43, 176(%3) \n\t" | |||
"stxv 44, 192(%3) \n\t" | |||
"stxv 45, 208(%3) \n\t" | |||
"stxv 46, 224(%3) \n\t" | |||
"stxv 47, 240(%3) \n\t" | |||
#else | |||
"stxv 41, 128(%3) \n\t" | |||
"stxv 40, 144(%3) \n\t" | |||
"stxv 43, 160(%3) \n\t" | |||
@@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
"stxv 44, 208(%3) \n\t" | |||
"stxv 47, 224(%3) \n\t" | |||
"stxv 46, 240(%3) \n\t" | |||
#endif | |||
"lxvp 40, 128(%2) \n\t" | |||
"lxvp 42, 160(%2) \n\t" | |||
"lxvp 44, 192(%2) \n\t" | |||
@@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
"bgt one%= \n" | |||
"two%=: \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"stxv 32, 0(%3) \n\t" | |||
"stxv 33, 16(%3) \n\t" | |||
"stxv 34, 32(%3) \n\t" | |||
"stxv 35, 48(%3) \n\t" | |||
"stxv 36, 64(%3) \n\t" | |||
"stxv 37, 80(%3) \n\t" | |||
"stxv 38, 96(%3) \n\t" | |||
"stxv 39, 112(%3) \n\t" | |||
"stxv 40, 128(%3) \n\t" | |||
"stxv 41, 144(%3) \n\t" | |||
"stxv 42, 160(%3) \n\t" | |||
"stxv 43, 176(%3) \n\t" | |||
"stxv 44, 192(%3) \n\t" | |||
"stxv 45, 208(%3) \n\t" | |||
"stxv 46, 224(%3) \n\t" | |||
"stxv 47, 240(%3) \n\t" | |||
#else | |||
"stxv 33, 0(%3) \n\t" | |||
"stxv 32, 16(%3) \n\t" | |||
"stxv 35, 32(%3) \n\t" | |||
@@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
"stxv 44, 208(%3) \n\t" | |||
"stxv 47, 224(%3) \n\t" | |||
"stxv 46, 240(%3) \n\t" | |||
#endif | |||
"#n=%1 x=%4=%2 y=%0=%3" | |||
: | |||
"=m" (*y), | |||
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#else | |||
#include "common.h" | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
#include "cdot_microk_power10.c" | |||
#else | |||
#ifndef HAVE_KERNEL_8 | |||
@@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
if ((inc_x == 1) && (inc_y == 1)) { | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
BLASLONG n1 = n & -16; | |||
#else | |||
BLASLONG n1 = n & -8; | |||
@@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
{ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
__vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
#else | |||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
#endif | |||
__asm__ | |||
( | |||
"dcbt 0, %2 \n\t" | |||
@@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
"xxswapd 33, 34 \n\t" | |||
"xvaddsp 35, 35, 32 \n\t" | |||
"xvaddsp 34, 34, 33 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xxpermdi 34, 35, 34, 0 \n\t" | |||
#else | |||
"xxpermdi 34, 34, 35, 2 \n\t" | |||
#endif | |||
"stxv 34, 0(%6) \n\t" | |||
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" | |||
@@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include "cgemm_macros_power10.S" | |||
#if (_AIX) | |||
.set perm_const1, 0x0405060700010203 | |||
.set perm_const2, 0x0c0d0e0f08090a0b | |||
.set save_permute_12, 0x1011121300010203 | |||
.set save_permute_11, 0x18191a1b08090a0b | |||
#else | |||
.equ perm_const1, 0x0405060700010203 | |||
.equ perm_const2, 0x0c0d0e0f08090a0b | |||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
.equ save_permute_11, 0x0405060714151617 | |||
#endif | |||
#ifndef NEEDPARAM | |||
@@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/* load reverse permute mask for big endian
   uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
#if (_AIX) | |||
lis T2, (perm_const2>>48 & 0xFFFF) | |||
lis T1, (perm_const1>>48 & 0xFFFF) | |||
lis T3, (save_permute_12>>48 & 0xFFFF) | |||
lis T4, (save_permute_11>>48 & 0xFFFF) | |||
ori T2, T2, (perm_const2>>32 & 0xFFFF) | |||
ori T1, T1, (perm_const1>>32 & 0xFFFF) | |||
ori T3, T3, (save_permute_12>>32 & 0xFFFF) | |||
ori T4, T4, (save_permute_11>>32 & 0xFFFF) | |||
#else | |||
lis T2, perm_const2@highest | |||
lis T1, perm_const1@highest | |||
lis T3, save_permute_12@highest | |||
lis T4, save_permute_11@highest | |||
ori T2, T2, perm_const2@higher | |||
ori T1, T1, perm_const1@higher | |||
ori T3, T3, save_permute_12@higher | |||
ori T4, T4, save_permute_11@higher | |||
#endif | |||
rldicr T2, T2, 32, 31 | |||
rldicr T1, T1, 32, 31 | |||
rldicr T3, T3, 32, 31 | |||
rldicr T4, T4, 32, 31 | |||
#if (_AIX) | |||
oris T2, T2, (perm_const2>>16 & 0xFFFF) | |||
oris T1, T1, (perm_const1>>16 & 0xFFFF) | |||
oris T3, T3, (save_permute_12>>16 & 0xFFFF) | |||
oris T4, T4, (save_permute_11>>16 & 0xFFFF) | |||
ori T2, T2, (perm_const2 & 0xFFFF) | |||
ori T1, T1, (perm_const1 & 0xFFFF) | |||
ori T3, T3, (save_permute_12 & 0xFFFF) | |||
ori T4, T4, (save_permute_11 & 0xFFFF) | |||
#else | |||
oris T2, T2, perm_const2@h | |||
oris T1, T1, perm_const1@h | |||
oris T3, T3, save_permute_12@h | |||
@@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ori T1, T1, perm_const1@l | |||
ori T3, T3, save_permute_12@l | |||
ori T4, T4, save_permute_11@l | |||
#endif | |||
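/* Note (illustrative): ppc64 has no 64-bit immediate load, so each permute
   constant is materialized 16 bits at a time (lis/ori build the high 32 bits,
   rldicr shifts them into place, oris/ori fill in the low 32 bits). The _AIX
   branch spells the pieces out with explicit shifts and masks, presumably
   because the AIX assembler does not accept the @highest/@higher/@h/@l
   operators used in the ELF branch. */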
li r0,0 | |||
li PRE,512 | |||
@@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.if \OffsetA != 0 | |||
addi \AREG, \AREG, \OffsetA | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 3, 36, 34 | |||
xvf32gerpp 2, 37, 34 | |||
xvf32gerpp 1, 32, 34 | |||
xvf32gerpp 0, 33, 34 | |||
xvf32gerpp 7, 36, 35 | |||
xvf32gerpp 6, 37, 35 | |||
xvf32gerpp 5, 32, 35 | |||
xvf32gerpp 4, 33, 35 | |||
#else | |||
xvf32gerpp 3, 36, 35 | |||
xvf32gerpp 2, 37, 35 | |||
xvf32gerpp 1, 32, 35 | |||
@@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvf32gerpp 6, 37, 34 | |||
xvf32gerpp 5, 32, 34 | |||
xvf32gerpp 4, 33, 34 | |||
#endif | |||
.endm | |||
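/* Note (illustrative): lxvp/stxvp move a 32-byte vector pair, and the ISA maps
   the two registers of the pair onto the low and high 16 bytes of storage in
   opposite order on little- and big-endian systems. The endian-specific
   branches above therefore feed each xvf32gerpp accumulator from the other
   register of the loaded pair, so both layouts accumulate the same matrix
   elements. */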
.macro LOAD4x8_2 | |||
@@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
xvf32gerpp 3, 36, 34 | |||
xvf32gerpp 2, 37, 34 | |||
xvf32gerpp 1, 32, 34 | |||
xvf32gerpp 0, 33, 34 | |||
xvf32gerpp 7, 36, 35 | |||
xvf32gerpp 6, 37, 35 | |||
xvf32gerpp 5, 32, 35 | |||
xvf32gerpp 4, 33, 35 | |||
#else | |||
xvf32gerpp 3, 36, 35 | |||
xvf32gerpp 2, 37, 35 | |||
xvf32gerpp 1, 32, 35 | |||
@@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvf32gerpp 6, 37, 34 | |||
xvf32gerpp 5, 32, 34 | |||
xvf32gerpp 4, 33, 34 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) | |||
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) | |||
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) | |||
.endif | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
xvf32gerpp 3, 42, 38 | |||
xvf32gerpp 2, 43, 38 | |||
xvf32gerpp 1, 40, 38 | |||
xvf32gerpp 0, 41, 38 | |||
xvf32gerpp 7, 42, 39 | |||
xvf32gerpp 6, 43, 39 | |||
xvf32gerpp 5, 40, 39 | |||
xvf32gerpp 4, 41, 39 | |||
#else | |||
xvf32gerpp 3, 42, 39 | |||
xvf32gerpp 2, 43, 39 | |||
xvf32gerpp 1, 40, 39 | |||
@@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvf32gerpp 6, 43, 38 | |||
xvf32gerpp 5, 40, 38 | |||
xvf32gerpp 4, 41, 38 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) | |||
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
@@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
RECONSTRUCT_PAIR2 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 1 | |||
xxpermdi vs3, vs2, vs10, 1 | |||
xxpermdi vs5, vs4, vs12, 1 | |||
xxpermdi vs7, vs6, vs14, 1 | |||
xxpermdi vs9, vs8, vs0, 1 | |||
xxpermdi vs11, vs10, vs2, 1 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 2 | |||
xxpermdi vs3, vs10, vs2, 2 | |||
xxpermdi vs5, vs12, vs4, 2 | |||
xxpermdi vs7, vs14, vs6, 2 | |||
xxpermdi vs9, vs0, vs8, 2 | |||
xxpermdi vs11, vs2, vs10, 2 | |||
#endif | |||
xvaddsp vs24, vs24, vs3 | |||
xvaddsp vs25, vs25, vs1 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs13, vs12, vs4, 1 | |||
xxpermdi vs15, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs13, vs4, vs12, 2 | |||
xxpermdi vs15, vs6, vs14, 2 | |||
#endif | |||
xvaddsp vs26, vs26, vs7 | |||
xvaddsp vs27, vs27, vs5 | |||
xvaddsp vs28, vs28, vs11 | |||
xvaddsp vs29, vs29, vs9 | |||
xvaddsp vs30, vs30, vs15 | |||
xvaddsp vs31, vs31, vs13 | |||
#else | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
xxpermdi vs25, vs0, vs8, 1 | |||
xxpermdi vs24, vs2, vs10, 1 | |||
xxpermdi vs27, vs4, vs12, 1 | |||
xxpermdi vs26, vs6, vs14, 1 | |||
xxpermdi vs29, vs8, vs0, 1 | |||
xxpermdi vs28, vs10, vs2, 1 | |||
xxpermdi vs31, vs12, vs4, 1 | |||
xxpermdi vs30, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs25, vs8, vs0, 2 | |||
xxpermdi vs24, vs10, vs2, 2 | |||
@@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxpermdi vs28, vs2, vs10, 2 | |||
xxpermdi vs31, vs4, vs12, 2 | |||
xxpermdi vs30, vs6, vs14, 2 | |||
#endif | |||
#endif | |||
stxvp vs24, 0(CO) | |||
MULT_APLHA_PART1 vs48, vs56, vs0, vs1 | |||
@@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
RECONSTRUCT_PAIR2 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 1 | |||
xxpermdi vs3, vs2, vs10, 1 | |||
xxpermdi vs5, vs4, vs12, 1 | |||
xxpermdi vs7, vs6, vs14, 1 | |||
xxpermdi vs9, vs8, vs0, 1 | |||
xxpermdi vs11, vs10, vs2, 1 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 2 | |||
xxpermdi vs3, vs10, vs2, 2 | |||
xxpermdi vs5, vs12, vs4, 2 | |||
xxpermdi vs7, vs14, vs6, 2 | |||
xxpermdi vs9, vs0, vs8, 2 | |||
xxpermdi vs11, vs2, vs10, 2 | |||
#endif | |||
xvaddsp vs32, vs32, vs3 | |||
xvaddsp vs33, vs33, vs1 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs13, vs12, vs4, 1 | |||
xxpermdi vs15, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs13, vs4, vs12, 2 | |||
xxpermdi vs15, vs6, vs14, 2 | |||
#endif | |||
xvaddsp vs40, vs40, vs7 | |||
xvaddsp vs41, vs41, vs5 | |||
xvaddsp vs34, vs34, vs11 | |||
xvaddsp vs35, vs35, vs9 | |||
xvaddsp vs42, vs42, vs15 | |||
xvaddsp vs43, vs43, vs13 | |||
#else | |||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
xxpermdi vs33, vs0, vs8, 1 | |||
xxpermdi vs32, vs2, vs10, 1 | |||
xxpermdi vs41, vs4, vs12, 1 | |||
xxpermdi vs40, vs6, vs14, 1 | |||
xxpermdi vs35, vs8, vs0, 1 | |||
xxpermdi vs34, vs10, vs2, 1 | |||
xxpermdi vs43, vs12, vs4, 1 | |||
xxpermdi vs42, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs33, vs8, vs0, 2 | |||
xxpermdi vs32, vs10, vs2, 2 | |||
@@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxpermdi vs34, vs2, vs10, 2 | |||
xxpermdi vs43, vs4, vs12, 2 | |||
xxpermdi vs42, vs6, vs14, 2 | |||
#endif | |||
#endif | |||
stxvp vs32, 0(T2) | |||
stxvp vs40, 32(T2) | |||
@@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.if \OffsetA != 0 | |||
addi \AREG, \AREG, \OffsetA | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 3, 32, 35 | |||
xvf32gerpp 2, 33, 35 | |||
xvf32gerpp 1, 32, 34 | |||
xvf32gerpp 0, 33, 34 | |||
#else | |||
xvf32gerpp 3, 32, 34 | |||
xvf32gerpp 2, 33, 34 | |||
xvf32gerpp 1, 32, 35 | |||
xvf32gerpp 0, 33, 35 | |||
#endif | |||
.endm | |||
.macro LOAD4x4_2 | |||
@@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 3, 32, 35 | |||
xvf32gerpp 2, 33, 35 | |||
xvf32gerpp 1, 32, 34 | |||
xvf32gerpp 0, 33, 34 | |||
#else | |||
xvf32gerpp 3, 32, 34 | |||
xvf32gerpp 2, 33, 34 | |||
xvf32gerpp 1, 32, 35 | |||
xvf32gerpp 0, 33, 35 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) | |||
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 3, 36, 39 | |||
xvf32gerpp 2, 37, 39 | |||
xvf32gerpp 1, 36, 38 | |||
xvf32gerpp 0, 37, 38 | |||
#else | |||
xvf32gerpp 3, 36, 38 | |||
xvf32gerpp 2, 37, 38 | |||
xvf32gerpp 1, 36, 39 | |||
xvf32gerpp 0, 37, 39 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
@@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
RECONSTRUCT_PAIR2 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 1 | |||
xxpermdi vs3, vs2, vs10, 1 | |||
xxpermdi vs9, vs8, vs0, 1 | |||
xxpermdi vs11, vs10, vs2, 1 | |||
xxpermdi vs5, vs4, vs12, 1 | |||
xxpermdi vs7, vs6, vs14, 1 | |||
xxpermdi vs13, vs12, vs4, 1 | |||
xxpermdi vs15, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 2 | |||
xxpermdi vs3, vs10, vs2, 2 | |||
xxpermdi vs9, vs0, vs8, 2 | |||
@@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxpermdi vs7, vs14, vs6, 2 | |||
xxpermdi vs13, vs4, vs12, 2 | |||
xxpermdi vs15, vs6, vs14, 2 | |||
#endif | |||
xvaddsp vs24, vs24, vs3 | |||
xvaddsp vs25, vs25, vs1 | |||
xvaddsp vs26, vs26, vs11 | |||
@@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvaddsp vs29, vs29, vs5 | |||
xvaddsp vs30, vs30, vs15 | |||
xvaddsp vs31, vs31, vs13 | |||
#else | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs25, vs0, vs8, 1 | |||
xxpermdi vs24, vs2, vs10, 1 | |||
xxpermdi vs27, vs8, vs0, 1 | |||
xxpermdi vs26, vs10, vs2, 1 | |||
xxpermdi vs29, vs4, vs12, 1 | |||
xxpermdi vs28, vs6, vs14, 1 | |||
xxpermdi vs31, vs12, vs4, 1 | |||
xxpermdi vs30, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs25, vs8, vs0, 2 | |||
xxpermdi vs24, vs10, vs2, 2 | |||
@@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxpermdi vs28, vs14, vs6, 2 | |||
xxpermdi vs31, vs4, vs12, 2 | |||
xxpermdi vs30, vs6, vs14, 2 | |||
#endif | |||
#endif | |||
stxvp vs24, 0(CO) | |||
stxvp vs26, 0(T1) | |||
@@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.if \OffsetA != 0 | |||
addi \AREG, \AREG, \OffsetA | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 1, 35, 32 | |||
xvf32gerpp 0, 34, 32 | |||
#else | |||
xvf32gerpp 1, 34, 32 | |||
xvf32gerpp 0, 35, 32 | |||
#endif | |||
.endm | |||
.macro LOAD4x2_2 | |||
@@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 1, 35, 32 | |||
xvf32gerpp 0, 34, 32 | |||
#else | |||
xvf32gerpp 1, 34, 33 | |||
xvf32gerpp 0, 35, 33 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 1, 37, 33 | |||
xvf32gerpp 0, 36, 33 | |||
#else | |||
xvf32gerpp 1, 36, 32 | |||
xvf32gerpp 0, 37, 32 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) | |||
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
@@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
RECONSTRUCT_PAIR1 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 0 | |||
xxpermdi vs9, vs2, vs10, 0 | |||
xxpermdi vs3, vs8, vs0, 3 | |||
xxpermdi vs11, vs10, vs2, 3 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 0 | |||
xxpermdi vs9, vs10, vs2, 0 | |||
xxpermdi vs3, vs0, vs8, 3 | |||
xxpermdi vs11, vs2, vs10, 3 | |||
#endif | |||
xvaddsp vs24, vs24, vs1 | |||
xvaddsp vs26, vs26, vs9 | |||
xvaddsp vs25, vs25, vs3 | |||
xvaddsp vs27, vs27, vs11 | |||
#else | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs24, vs0, vs8, 0 | |||
xxpermdi vs26, vs2, vs10, 0 | |||
xxpermdi vs25, vs8, vs0, 3 | |||
xxpermdi vs27, vs10, vs2, 3 | |||
#else | |||
xxpermdi vs24, vs8, vs0, 0 | |||
xxpermdi vs26, vs10, vs2, 0 | |||
xxpermdi vs25, vs0, vs8, 3 | |||
xxpermdi vs27, vs2, vs10, 3 | |||
#endif | |||
#endif | |||
stxv vs24, 0(CO) | |||
stxv vs25, 0(T1) | |||
@@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.if \OffsetA != 0 | |||
addi \AREG, \AREG, \OffsetA | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 0, 34, 32 | |||
xvf32gerpp 1, 35, 32 | |||
#else | |||
xvf32gerpp 0, 35, 32 | |||
xvf32gerpp 1, 34, 32 | |||
#endif | |||
.endm | |||
.macro LOAD4x1_2 | |||
@@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro LOAD4x1_2O OffsetA, OffsetB | |||
lxv vs32, (\OffsetA)(AO) | |||
vspltisb v6, 0 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs33, vs32, vs38, 2 | |||
xxpermdi vs32, vs32, vs38, 0 | |||
#else | |||
xxpermdi vs33, vs32, vs38, 0 | |||
xxpermdi vs32, vs32, vs38, 2 | |||
#endif | |||
lxvp vs34, (0+\OffsetB)(BO) | |||
lxvp vs36, (32+\OffsetB)(BO) | |||
.endm | |||
@@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 0, 34, 32 | |||
xvf32gerpp 1, 35, 32 | |||
#else | |||
xvf32gerpp 0, 35, 32 | |||
xvf32gerpp 1, 34, 32 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 0, 36, 33 | |||
xvf32gerpp 1, 37, 33 | |||
#else | |||
xvf32gerpp 0, 37, 33 | |||
xvf32gerpp 1, 36, 33 | |||
#endif | |||
.if \Complete==0 | |||
lxv vs32, DISP2(\Index, \OffsetA)(\AREG) | |||
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs33, vs32, vs38, 2 | |||
xxpermdi vs32, vs32, vs38, 0 | |||
#else | |||
xxpermdi vs33, vs32, vs38, 0 | |||
xxpermdi vs32, vs32, vs38, 2 | |||
#endif | |||
.endif | |||
.if \IsLast==1 | |||
.if \Complete==1 | |||
@@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 2, 37, 34 | |||
xvf32gerpp 3, 36, 34 | |||
xvf32gerpp 0, 33, 34 | |||
xvf32gerpp 1, 32, 34 | |||
#else | |||
xvf32gerpp 2, 37, 35 | |||
xvf32gerpp 3, 36, 35 | |||
xvf32gerpp 0, 33, 35 | |||
xvf32gerpp 1, 32, 35 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) | |||
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 2, 41, 35 | |||
xvf32gerpp 3, 40, 35 | |||
xvf32gerpp 0, 39, 35 | |||
xvf32gerpp 1, 38, 35 | |||
#else | |||
xvf32gerpp 2, 41, 34 | |||
xvf32gerpp 3, 40, 34 | |||
xvf32gerpp 0, 39, 34 | |||
xvf32gerpp 1, 38, 34 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) | |||
@@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
RECONSTRUCT_PAIR2 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 1 | |||
xxpermdi vs3, vs2, vs10, 1 | |||
xxpermdi vs5, vs4, vs12, 1 | |||
xxpermdi vs7, vs6, vs14, 1 | |||
xxpermdi vs9, vs8, vs0, 1 | |||
xxpermdi vs11, vs10, vs2, 1 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 2 | |||
xxpermdi vs3, vs10, vs2, 2 | |||
xxpermdi vs5, vs12, vs4, 2 | |||
xxpermdi vs7, vs14, vs6, 2 | |||
xxpermdi vs9, vs0, vs8, 2 | |||
xxpermdi vs11, vs2, vs10, 2 | |||
#endif | |||
xvaddsp vs24, vs24, vs3 | |||
xvaddsp vs25, vs25, vs1 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs13, vs12, vs4, 1 | |||
xxpermdi vs15, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs13, vs4, vs12, 2 | |||
xxpermdi vs15, vs6, vs14, 2 | |||
#endif | |||
xvaddsp vs26, vs26, vs7 | |||
xvaddsp vs27, vs27, vs5 | |||
xvaddsp vs28, vs28, vs11 | |||
@@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvaddsp vs30, vs30, vs15 | |||
xvaddsp vs31, vs31, vs13 | |||
#else | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs25, vs0, vs8, 1 | |||
xxpermdi vs24, vs2, vs10, 1 | |||
xxpermdi vs27, vs4, vs12, 1 | |||
xxpermdi vs26, vs6, vs14, 1 | |||
xxpermdi vs29, vs8, vs0, 1 | |||
xxpermdi vs28, vs10, vs2, 1 | |||
xxpermdi vs31, vs12, vs4, 1 | |||
xxpermdi vs30, vs14, vs6, 1 | |||
#else | |||
xxpermdi vs25, vs8, vs0, 2 | |||
xxpermdi vs24, vs10, vs2, 2 | |||
xxpermdi vs27, vs12, vs4, 2 | |||
@@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxpermdi vs28, vs2, vs10, 2 | |||
xxpermdi vs31, vs4, vs12, 2 | |||
xxpermdi vs30, vs6, vs14, 2 | |||
#endif | |||
#endif | |||
stxvp vs24, 0(CO) | |||
stxvp vs26, 32(CO) | |||
@@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 0, 33, 34 | |||
xvf32gerpp 1, 32, 34 | |||
#else | |||
xvf32gerpp 0, 33, 35 | |||
xvf32gerpp 1, 32, 35 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) | |||
.endif | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xvf32gerpp 0, 37, 35 | |||
xvf32gerpp 1, 36, 35 | |||
#else | |||
xvf32gerpp 0, 37, 34 | |||
xvf32gerpp 1, 36, 34 | |||
#endif | |||
.if \Complete==0 | |||
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) | |||
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
@@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
RECONSTRUCT_PAIR1 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 1 | |||
xxpermdi vs3, vs2, vs10, 1 | |||
xxpermdi vs9, vs8, vs0, 1 | |||
xxpermdi vs11, vs10, vs2, 1 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 2 | |||
xxpermdi vs3, vs10, vs2, 2 | |||
xxpermdi vs9, vs0, vs8, 2 | |||
xxpermdi vs11, vs2, vs10, 2 | |||
#endif | |||
xvaddsp vs24, vs24, vs3 | |||
xvaddsp vs25, vs25, vs1 | |||
xvaddsp vs26, vs26, vs11 | |||
xvaddsp vs27, vs27, vs9 | |||
#else | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs25, vs0, vs8, 1 | |||
xxpermdi vs24, vs2, vs10, 1 | |||
xxpermdi vs27, vs8, vs0, 1 | |||
xxpermdi vs26, vs10, vs2, 1 | |||
#else | |||
xxpermdi vs25, vs8, vs0, 2 | |||
xxpermdi vs24, vs10, vs2, 2 | |||
xxpermdi vs27, vs0, vs8, 2 | |||
xxpermdi vs26, vs2, vs10, 2 | |||
#endif | |||
#endif | |||
stxvp vs24, 0(CO) | |||
stxvp vs26, 0(T1) | |||
@@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xxperm vs8, vs9, save_permute_1 | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs1, vs0, vs8, 0 | |||
xxpermdi vs9, vs8, vs0, 3 | |||
#else | |||
xxpermdi vs1, vs8, vs0, 0 | |||
xxpermdi vs9, vs0, vs8, 3 | |||
#endif | |||
xvaddsp vs24, vs24, vs1 | |||
xvaddsp vs26, vs26, vs9 | |||
#else | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs24, vs0, vs8, 0 | |||
xxpermdi vs26, vs8, vs0, 3 | |||
#else | |||
xxpermdi vs24, vs8, vs0, 0 | |||
xxpermdi vs26, vs0, vs8, 3 | |||
#endif | |||
#endif | |||
stxv vs24, 0(CO) | |||
stxv vs26, 0(T1) | |||
@@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
lxvp vs32, (0+\OffsetA)(AO) | |||
lxvp vs36, (32+\OffsetA)(AO) | |||
vspltisb v10, 0 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs35, vs34, vs42, 2 | |||
xxpermdi vs34, vs34, vs42, 0 | |||
#else | |||
xxpermdi vs35, vs34, vs42, 0 | |||
xxpermdi vs34, vs34, vs42, 2 | |||
#endif | |||
lxvp vs38, (64+\OffsetA)(AO) | |||
lxvp vs40, (64+32+\OffsetA)(AO) | |||
.endm | |||
@@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvf32gerpp 3, 35, 40 | |||
.if \Complete==0 | |||
lxv vs34, DISP2(\Index, \OffsetB)(\BREG) | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs35, vs34, vs42, 2 | |||
xxpermdi vs34, vs34, vs42, 0 | |||
#else | |||
xxpermdi vs35, vs34, vs42, 0 | |||
xxpermdi vs34, vs34, vs42, 2 | |||
#endif | |||
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) | |||
.endif | |||
.if \IsLast==1 | |||
@@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
MULT_APLHA_PART2 vs34, vs42, vs4, vs5 | |||
MULT_APLHA_PART2 vs35, vs43, vs6, vs7 | |||
/* reconstruct r, i pairs*/ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxperm vs0, vs1, save_permute_1 | |||
xxperm vs2, vs3, save_permute_1 | |||
xxperm vs4, vs5, save_permute_1 | |||
xxperm vs6, vs7, save_permute_1 | |||
#else | |||
xxperm vs0, vs1, vs28 | |||
xxperm vs2, vs3, vs28 | |||
xxperm vs4, vs5, vs28 | |||
xxperm vs6, vs7, vs28 | |||
#endif | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
xvaddsp vs24, vs24, vs2 | |||
@@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stxvp vs26, 32(CO) | |||
#else | |||
/* reconstruct r, i pairs*/ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
stxv vs2, 0(CO) | |||
stxv vs0, 16(CO) | |||
stxv vs6, 32(CO) | |||
stxv vs4, 48(CO) | |||
#else | |||
stxv vs0, 0(CO) | |||
stxv vs2, 16(CO) | |||
stxv vs4, 32(CO) | |||
stxv vs6, 48(CO) | |||
#endif | |||
#endif | |||
addi CO, CO, 64 | |||
.endm | |||
@@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
lxv vs34, (\OffsetB)(BO) | |||
lxvp vs32, (0+\OffsetA)(AO) | |||
vspltisb v6, 0 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs35, vs34, vs38, 2 | |||
xxpermdi vs34, vs34, vs38, 0 | |||
#else | |||
xxpermdi vs35, vs34, vs38, 0 | |||
xxpermdi vs34, vs34, vs38, 2 | |||
#endif | |||
lxvp vs36, (32+\OffsetA)(AO) | |||
.endm | |||
@@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
xvf32gerpp 1, 35, 36 | |||
.if \Complete==0 | |||
lxv vs34, DISP2(\Index, \OffsetB)(\BREG) | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxpermdi vs35, vs34, vs38, 2 | |||
xxpermdi vs34, vs34, vs38, 0 | |||
#else | |||
xxpermdi vs35, vs34, vs38, 0 | |||
xxpermdi vs34, vs34, vs38, 2 | |||
#endif | |||
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
.endif | |||
.if \IsLast==1 | |||
@@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
MULT_APLHA_PART2 vs32, vs40, vs0, vs1 | |||
MULT_APLHA_PART2 vs33, vs41, vs2, vs3 | |||
/* reconstruct r, i pairs*/ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxperm vs0, vs1, save_permute_1 | |||
xxperm vs2, vs3, save_permute_1 | |||
#else | |||
xxperm vs0, vs1, vs28 | |||
xxperm vs2, vs3, vs28 | |||
#endif | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
xvaddsp vs24, vs24, vs2 | |||
@@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
stxvp vs24, 0(CO) | |||
#else | |||
/* reconstruct r, i pairs*/ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
stxv vs2, 0(CO) | |||
stxv vs0, 16(CO) | |||
#else | |||
stxv vs0, 0(CO) | |||
stxv vs2, 16(CO) | |||
#endif | |||
#endif | |||
addi CO, CO, 32 | |||
.endm | |||
@@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
MULT_APLHA_PART1 vs32, vs40, vs0, vs1 | |||
MULT_APLHA_PART2 vs32, vs40, vs0, vs1 | |||
/* reconstruct r, i pairs*/ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxperm vs0, vs1, save_permute_1 | |||
#else | |||
xxperm vs0, vs1, vs28 | |||
#endif | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
xvaddsp vs24, vs24, vs0 | |||
@@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
MULT_APLHA_PART1 vs32, vs40, vs37, vs1 | |||
MULT_APLHA_PART2 vs32, vs40, vs37, vs1 | |||
/* reconstruct r, i pairs*/ | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
xxperm vs37, vs1, save_permute_1 | |||
#else | |||
xxperm vs37, vs1, vs28 | |||
#endif | |||
#ifndef TRMMKERNEL | |||
/* add */ | |||
xvaddsp vs36, vs36, vs37 | |||
@@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||
{ | |||
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
__vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; | |||
#else | |||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
#endif | |||
__asm__ | |||
( | |||
"dcbt 0, %2 \n\t" | |||
@@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "cswap_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "cswap_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "cswap_microk_power8.c" | |||
#include "cswap_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "dasum_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "dasum_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "dasum_microk_power8.c" | |||
#include "dasum_microk_power10.c" | |||
#endif | |||
#endif | |||
#ifndef HAVE_KERNEL_16 | |||
static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) | |||
@@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
if ( inc_x == 1 ) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 32) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
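The align expression in the last line peels off just enough leading elements to bring x to a 32-byte boundary for the POWER10 vector loads; a self-contained sketch of the same computation (peel_to_32B is an illustrative name):

#include <stdint.h>

/* Number of 8-byte elements to process scalar-wise before x is 32-byte
   aligned; mirrors ((32 - ((uintptr_t)x & 0x1F)) >> 3) & 0x3 above. */
static inline long peel_to_32B(const double *x)
{
    return ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
}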
@@ -0,0 +1,923 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
typedef __vector unsigned char vec_t; | |||
#if !__has_builtin(__builtin_vsx_assemble_pair) | |||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
#endif | |||
#if !defined(B0) | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_4x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_2x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#else | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_4x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_2x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#endif | |||
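In scalar terms, each SAVE_4x2_ACC above disassembles an MMA accumulator into four two-double vectors and writes back a 4x2 tile of C as alpha*acc plus (unless B0 is defined) beta*C. A hedged reference, with save_4x2_ref purely illustrative:

/* Scalar picture of SAVE_4x2_ACC: four result vectors of two doubles each,
   written to rows N..N+3 at columns M..M+1 of C. Sketch only. */
static void save_4x2_ref(double *C, long ldc, long N, long M,
                         const double acc[4][2], double alpha, double beta)
{
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 2; i++)
            C[(N + j) * ldc + M + i] =
                alpha * acc[j][i] + beta * C[(N + j) * ldc + M + i];
}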
#define INIT_8ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); \ | |||
__builtin_mma_xxsetaccz(&acc4); \ | |||
__builtin_mma_xxsetaccz(&acc5); \ | |||
__builtin_mma_xxsetaccz(&acc6); \ | |||
__builtin_mma_xxsetaccz(&acc7); | |||
#define INIT_4ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); | |||
#define INIT_2ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); | |||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
#if (defined(__GNUC__) && (__GNUC__ == 10)) | |||
#if defined(_AIX) | |||
#define LOAD_PAIR(pair, v0, v1) \ | |||
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
#else | |||
#define LOAD_PAIR(pair, v0, v1) \ | |||
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); | |||
#endif | |||
#else | |||
#define LOAD_PAIR(pair, v0, v1) \ | |||
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
#endif | |||
#define LOAD_A_1x8(K, M) \ | |||
ra0 = vec_xl(0, A+((K)*lda)+M+0); \ | |||
ra1 = vec_xl(0, A+((K)*lda)+M+2); \ | |||
ra2 = vec_xl(0, A+((K)*lda)+M+4); \ | |||
ra3 = vec_xl(0, A+((K)*lda)+M+6); | |||
#define LOAD_A_1x4(K, M) \ | |||
ra0 = vec_xl(0, A+((K)*lda)+M+0); \ | |||
ra1 = vec_xl(0, A+((K)*lda)+M+2); | |||
ra1 = vec_xl(0, A+((K)*lda)+M+2);
#define LOAD_A_1x2(K, M) \ | |||
ra0 = vec_xl(0, A+((K)*lda)+M+0); | |||
#define LOAD_A_1x1(K, M) \ | |||
ra0 = vec_splats(A[((K)*lda)+M+0]); | |||
#define LOAD_BTP_8x2(N, K) \ | |||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb0, rb1); \ | |||
t1 = vec_mergeh(rb2, rb3); \ | |||
LOAD_PAIR(pb0, t0, t1); \ | |||
t0 = vec_mergel(rb0, rb1); \ | |||
t1 = vec_mergel(rb2, rb3); \ | |||
LOAD_PAIR(pb2, t0, t1); \ | |||
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ | |||
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ | |||
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ | |||
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb4, rb5); \ | |||
t1 = vec_mergeh(rb6, rb7); \ | |||
LOAD_PAIR(pb1, t0, t1); \ | |||
t0 = vec_mergel(rb4, rb5); \ | |||
t1 = vec_mergel(rb6, rb7); \ | |||
LOAD_PAIR(pb3, t0, t1); | |||
#define LOAD_BTP_8x1(N, K) \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
rb1 = vec_xor(rb1, rb1); \ | |||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
LOAD_PAIR(pb0, rb0, rb1); \ | |||
rb2 = vec_xor(rb2, rb2); \ | |||
rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \ | |||
rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \ | |||
rb3 = vec_xor(rb3, rb3); \ | |||
rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \ | |||
rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \ | |||
LOAD_PAIR(pb1, rb2, rb3); | |||
#define LOAD_BTP_4x2(N, K) \ | |||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb0, rb1); \ | |||
t1 = vec_mergeh(rb2, rb3); \ | |||
LOAD_PAIR(pb0, t0, t1); \ | |||
t0 = vec_mergel(rb0, rb1); \ | |||
t1 = vec_mergel(rb2, rb3); \ | |||
LOAD_PAIR(pb1, t0, t1); | |||
#define LOAD_BTP_4x1(N, K) \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
rb1 = vec_xor(rb1, rb1); \ | |||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
LOAD_PAIR(pb0, rb0, rb1); | |||
#define LOAD_BTP_2x2(N, K) \ | |||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb0, rb1); \ | |||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ | |||
t1 = vec_mergel(rb0, rb1); \ | |||
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); | |||
#define LOAD_BTP_2x1(N, K) \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); | |||
#define LOAD_B_1x1(N, K) \ | |||
rb0 = vec_splats(B[((N)*ldb)+K]); | |||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
#define KERNEL_MMA_1ACC(b0, a0) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); \ | |||
result2 = vec_madd(a2, b2, result2); \ | |||
result3 = vec_madd(a3, b3, result3); | |||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); | |||
#define KERNEL_VMADD_1VSR(a0, b0) \ | |||
result = vec_madd(a0, b0, result); | |||
#define PACK_B(pb0, pb1, offset) \ | |||
*((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \ | |||
*((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; | |||
#define LOAD_PACKED_B(pb0, pb1, offset) \ | |||
pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ | |||
pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); | |||
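PACK_B and LOAD_PACKED_B stage the transposed B pairs in a scratch buffer so that, once the first m-tile of a column block has transposed B, later m-tiles reload it directly; two __vector_pair slots (eight doubles) are stored per k. A hedged sketch of the allocation only (alloc_packB is an illustrative name matching the malloc further below):

#include <stdlib.h>

/* Scratch area for packed B: 8 doubles per k iteration, as used by
   PACK_B/LOAD_PACKED_B above. Sketch only. */
static double *alloc_packB(long K)
{
    return (double *)malloc((size_t)K * 8 * sizeof(double));
}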
#ifdef B0 | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
#else | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG m, n, k; | |||
BLASLONG m8 = M & ~7; | |||
BLASLONG m4 = M & ~3; | |||
BLASLONG m2 = M & ~1; | |||
BLASLONG n8 = N & ~7; | |||
BLASLONG n4 = N & ~3; | |||
BLASLONG n2 = N & ~1; | |||
BLASLONG k2 = K & ~1; | |||
#if defined(__GNUC__) && !defined(__clang__) | |||
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; | |||
#else | |||
int has_packing = 0; | |||
#endif | |||
double *packB; | |||
if (has_packing) packB = (double *)malloc(K*8*sizeof(double)); | |||
vector double valpha = vec_splats(alpha); | |||
#if !defined(B0) | |||
vector double vbeta = vec_splats(beta); | |||
#endif | |||
for (n = 0; n < n8; n += 8) { | |||
for (m = 0; m < m8; m += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
INIT_8ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
if (has_packing) { | |||
if (m == 0) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
PACK_B(pb0, pb1, 0); | |||
LOAD_A_1x8(k+1, m); | |||
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
PACK_B(pb2, pb3, 8); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
PACK_B(pb0, pb1, 0); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
LOAD_A_1x8(k+1, m); | |||
LOAD_PACKED_B(pb2, pb3, 8); | |||
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
LOAD_A_1x8(k+1, m); | |||
KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
SAVE_4x2_ACC(&acc4, n+0, m+4); | |||
SAVE_4x2_ACC(&acc6, n+0, m+6); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
SAVE_4x2_ACC(&acc5, n+4, m+4); | |||
SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
} | |||
for (; m < m4; m += 4) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
if (!has_packing) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
LOAD_A_1x4(k+1, m); | |||
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
LOAD_A_1x4(k+1, m); | |||
LOAD_PACKED_B(pb2, pb3, 8); | |||
KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
} | |||
for (; m < m2; m += 2) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
if (!has_packing) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
LOAD_A_1x2(k+1, m); | |||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
LOAD_A_1x2(k+1, m); | |||
LOAD_PACKED_B(pb2, pb3, 8); | |||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; m < M; m++) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
if (!has_packing) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
LOAD_PACKED_B(pb2, pb3, 8); | |||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_PACKED_B(pb0, pb1, 0); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x1_ACC(&acc0, n+0, m+0); | |||
SAVE_4x1_ACC(&acc1, n+4, m+0); | |||
} | |||
} | |||
for (; n < n4; n += 4) { | |||
for (m = 0; m < m8; m += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
LOAD_A_1x8(k+1, m); | |||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; m < m4; m += 4) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
LOAD_A_1x4(k+1, m); | |||
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; m < m2; m += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_A_1x2(k+1, m); | |||
KERNEL_MMA_1ACC(pb1, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n, m); | |||
} | |||
for (; m < M; m++) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
KERNEL_MMA_1ACC(pb1, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x1_ACC(&acc0, n, m); | |||
} | |||
} | |||
for (; n < n2; n += 2) { | |||
for (m = 0; m < m8; m += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
LOAD_A_1x8(k+1, m); | |||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; m < m4; m += 4) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
LOAD_A_1x4(k+1, m); | |||
KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; m < m2; m += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_A_1x2(k+1, m); | |||
KERNEL_MMA_1ACC(pb1, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
} | |||
for (; m < M; m++) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
KERNEL_MMA_1ACC(pb1, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x1_ACC(&acc0, n+0, m+0); | |||
} | |||
} | |||
for (; n < N; n++) { | |||
for (m = 0; m < m8; m += 8) { | |||
vector double result = ((vector double){0.,0.}); | |||
vector double result1 = ((vector double){0.,0.}); | |||
vector double result2 = ((vector double){0.,0.}); | |||
vector double result3 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x1(n, k); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
SAVE_1x4_VSR(result2, n, m+4); | |||
SAVE_1x4_VSR(result3, n, m+6); | |||
} | |||
for (; m < m4; m += 4) { | |||
vector double result = ((vector double){0.,0.}); | |||
vector double result1 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x1(n, k); | |||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
} | |||
for (; m < m2; m += 2) { | |||
vector double result = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_B_1x1(n, k); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
} | |||
for (; m < M; m++) { | |||
FLOAT result = 0.0; | |||
for (k = 0; k < K; k++) { | |||
result += A[m+k*lda] * B[n*ldb+k]; | |||
} | |||
result = result * alpha; | |||
#if !defined(B0) | |||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
#else | |||
C[n*ldc+m] = result; | |||
#endif | |||
} | |||
} | |||
if (has_packing) free(packB); | |||
return 0; | |||
} |
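The scalar tail loop in the function above also serves as a specification of the kernel's indexing (A read as A[k*lda+m], B as B[n*ldb+k]); a self-contained scalar reference under that assumption, with small_gemm_nt_ref an illustrative name:

/* Hedged scalar reference for this kernel's layout:
   C[n*ldc+m] = alpha * sum_k A[k*lda+m] * B[n*ldb+k] + beta * C[n*ldc+m]. */
static void small_gemm_nt_ref(long M, long N, long K,
                              const double *A, long lda, double alpha,
                              const double *B, long ldb, double beta,
                              double *C, long ldc)
{
    for (long n = 0; n < N; n++)
        for (long m = 0; m < M; m++) {
            double acc = 0.0;
            for (long k = 0; k < K; k++)
                acc += A[k * lda + m] * B[n * ldb + k];
            C[n * ldc + m] = alpha * acc + beta * C[n * ldc + m];
        }
}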
@@ -0,0 +1,581 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
typedef __vector unsigned char vec_t; | |||
#if !__has_builtin(__builtin_vsx_assemble_pair) | |||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
#endif | |||
#if !defined(B0) | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#define SAVE_4x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; | |||
#else | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#define SAVE_4x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = result[0]; \ | |||
C[(N+1)*ldc+M] = result[1]; | |||
#endif | |||
#define INIT_8ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); \ | |||
__builtin_mma_xxsetaccz(&acc4); \ | |||
__builtin_mma_xxsetaccz(&acc5); \ | |||
__builtin_mma_xxsetaccz(&acc6); \ | |||
__builtin_mma_xxsetaccz(&acc7); | |||
#define INIT_4ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); | |||
#define INIT_2ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); | |||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
#define LOAD_A_1x8(K, M) \ | |||
ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
ra1 = vec_xl(0, A+(K*lda)+M+2); \ | |||
ra2 = vec_xl(0, A+(K*lda)+M+4); \ | |||
ra3 = vec_xl(0, A+(K*lda)+M+6); | |||
#define LOAD_A_1x4(K, M) \ | |||
ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
ra1 = vec_xl(0, A+(K*lda)+M+2); | |||
#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M); | |||
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); | |||
#define LOAD_BP_1x8(K, N) \ | |||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ | |||
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); | |||
#define LOAD_BP_1x4(K, N) \ | |||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); | |||
#define LOAD_BP_1x2(K, N) \ | |||
t0 = vec_xl(0, B+(K*ldb)+N); \ | |||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); | |||
#define LOAD_B_1x8(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
rb1 = vec_xl(0, B+(K*ldb)+N+2); \ | |||
rb2 = vec_xl(0, B+(K*ldb)+N+4); \ | |||
rb3 = vec_xl(0, B+(K*ldb)+N+6); | |||
#define LOAD_B_1x4(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
rb1 = vec_xl(0, B+(K*ldb)+N+2); | |||
#define LOAD_B_1x2(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); | |||
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); | |||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
#define KERNEL_MMA_1ACC(b0, a0) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); \ | |||
result2 = vec_madd(a2, b2, result2); \ | |||
result3 = vec_madd(a3, b3, result3); | |||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); | |||
#define KERNEL_VMADD_1VSR(a0, b0) \ | |||
result = vec_madd(a0, b0, result); | |||
#ifdef B0 | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
#else | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG m, n, k; | |||
BLASLONG m8 = M & ~7; | |||
BLASLONG m4 = M & ~3; | |||
BLASLONG m2 = M & ~1; | |||
BLASLONG n8 = N & ~7; | |||
BLASLONG n4 = N & ~3; | |||
BLASLONG n2 = N & ~1; | |||
vector double valpha = vec_splats(alpha); | |||
#if !defined(B0) | |||
vector double vbeta = vec_splats(beta); | |||
#endif | |||
for (m = 0; m < m8; m += 8) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
INIT_8ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
SAVE_4x2_ACC(&acc4, n+0, m+4); | |||
SAVE_4x2_ACC(&acc6, n+0, m+6); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
SAVE_4x2_ACC(&acc5, n+4, m+4); | |||
SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
__vector_pair pb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double t0; | |||
__vector_pair pb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double result2 = ((vector double){0.,0.}); | |||
register vector double result3 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
SAVE_1x4_VSR(result2, n, m+4); | |||
SAVE_1x4_VSR(result3, n, m+6); | |||
} | |||
} | |||
for (; m < m4; m += 4) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1; | |||
__vector_pair pb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1; | |||
register vector double t0; | |||
__vector_pair pb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
} | |||
} | |||
for (; m < m2; m += 2) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
__vector_pair pb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double t0; | |||
__vector_pair pb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n, m); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
} | |||
} | |||
for (; m < M; m++) { | |||
for (n = 0; n < n8; n += 8) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double result2 = ((vector double){0.,0.}); | |||
register vector double result3 = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); | |||
} | |||
SAVE_4x1_VSR(result, n, m); | |||
SAVE_4x1_VSR(result1, n+2, m); | |||
SAVE_4x1_VSR(result2, n+4, m); | |||
SAVE_4x1_VSR(result3, n+6, m); | |||
} | |||
for (; n < n4; n += 4) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0, rb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); | |||
} | |||
SAVE_4x1_VSR(result, n, m); | |||
SAVE_4x1_VSR(result1, n+2, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
SAVE_4x1_VSR(result, n, m); | |||
} | |||
for (; n < N; n++) { | |||
FLOAT result = 0.0; | |||
for (k = 0; k < K; k++) { | |||
result += A[k*lda+m] * B[k*ldb+n]; | |||
} | |||
result = result * alpha; | |||
#if !defined(B0) | |||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
#else | |||
C[n*ldc+m] = result; | |||
#endif | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,882 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
typedef __vector unsigned char vec_t; | |||
#if !__has_builtin(__builtin_vsx_assemble_pair) | |||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
#endif | |||
#if !defined(B0) | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_4x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_2x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#else | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_4x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_2x1_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#endif | |||
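/* When B0 is defined this is the beta == 0 variant of the kernel (the CNAME | |||
   signature below takes no beta argument): the SAVE_* macros then skip loading | |||
   and scaling C and only apply alpha, whereas the default variants above | |||
   compute alpha*A*B + beta*C. */ | |||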
#define INIT_8ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); \ | |||
__builtin_mma_xxsetaccz(&acc4); \ | |||
__builtin_mma_xxsetaccz(&acc5); \ | |||
__builtin_mma_xxsetaccz(&acc6); \ | |||
__builtin_mma_xxsetaccz(&acc7); | |||
#define INIT_4ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); | |||
#define INIT_2ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); | |||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
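/* The LOAD_PAIR wrapper papers over compiler differences: GCC 10 only offers | |||
   __builtin_vsx_assemble_pair, whose operand order differs between AIX and | |||
   little-endian Linux, while newer compilers provide __builtin_vsx_build_pair. */ | |||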
#if (defined(__GNUC__) && (__GNUC__ == 10)) | |||
#if defined(_AIX) | |||
#define LOAD_PAIR(pair, v0, v1) \ | |||
__builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
#else | |||
#define LOAD_PAIR(pair, v0, v1) \ | |||
__builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); | |||
#endif | |||
#else | |||
#define LOAD_PAIR(pair, v0, v1) \ | |||
__builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); | |||
#endif | |||
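/* The LOAD_AT_* macros read two consecutive K elements from 2, 4 or 8 rows of A | |||
   and transpose them in registers with vec_mergeh/vec_mergel, so each raN vector | |||
   ends up holding one K slice of two adjacent rows; the *_x1 variants build the | |||
   same layout element by element for the K remainder. */ | |||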
#define LOAD_AT_8x2(M, K) \ | |||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
t0 = vec_mergeh(ra0, ra1); \ | |||
t1 = vec_mergel(ra0, ra1); \ | |||
ra0 = t0; \ | |||
ra1 = t1; \ | |||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
t0 = vec_mergeh(ra2, ra3); \ | |||
t1 = vec_mergel(ra2, ra3); \ | |||
ra2 = t0; \ | |||
ra3 = t1; \ | |||
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ | |||
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ | |||
t0 = vec_mergeh(ra4, ra5); \ | |||
t1 = vec_mergel(ra4, ra5); \ | |||
ra4 = t0; \ | |||
ra5 = t1; \ | |||
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ | |||
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ | |||
t0 = vec_mergeh(ra6, ra7); \ | |||
t1 = vec_mergel(ra6, ra7); \ | |||
ra6 = t0; \ | |||
ra7 = t1; | |||
#define LOAD_AT_8x1(M, K) \ | |||
ra0 = vec_xor(ra0, ra0); \ | |||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
ra1 = vec_xor(ra1, ra1); \ | |||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ | |||
ra2 = vec_xor(ra2, ra2); \ | |||
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ | |||
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ | |||
ra3 = vec_xor(ra3, ra3); \ | |||
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ | |||
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); | |||
#define LOAD_AT_4x2(M, K) \ | |||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
t0 = vec_mergeh(ra0, ra1); \ | |||
t1 = vec_mergeh(ra2, ra3); \ | |||
t2 = vec_mergel(ra0, ra1); \ | |||
t3 = vec_mergel(ra2, ra3); \ | |||
ra0 = t0; \ | |||
ra1 = t2; \ | |||
ra2 = t1; \ | |||
ra3 = t3; | |||
#define LOAD_AT_4x1(M, K) \ | |||
ra0 = vec_xor(ra0, ra0); \ | |||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
ra1 = vec_xor(ra1, ra1); \ | |||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); | |||
#define LOAD_AT_2x2(M, K) \ | |||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
t0 = vec_mergeh(ra0, ra1); \ | |||
t1 = vec_mergel(ra0, ra1); \ | |||
ra0 = t0; \ | |||
ra1 = t1; | |||
#define LOAD_AT_2x1(M, K) \ | |||
ra0 = vec_xor(ra0, ra0); \ | |||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); | |||
#define LOAD_A_1x1(K, M) \ | |||
ra0 = vec_splats(A[((M+0)*lda)+K+0]); | |||
#define LOAD_BTP_8x2(N, K) \ | |||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb0, rb1); \ | |||
t1 = vec_mergeh(rb2, rb3); \ | |||
LOAD_PAIR(pb0, t0, t1); \ | |||
t0 = vec_mergel(rb0, rb1); \ | |||
t1 = vec_mergel(rb2, rb3); \ | |||
LOAD_PAIR(pb2, t0, t1); \ | |||
rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ | |||
rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ | |||
rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ | |||
rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb4, rb5); \ | |||
t1 = vec_mergeh(rb6, rb7); \ | |||
LOAD_PAIR(pb1, t0, t1); \ | |||
t0 = vec_mergel(rb4, rb5); \ | |||
t1 = vec_mergel(rb6, rb7); \ | |||
LOAD_PAIR(pb3, t0, t1); | |||
#define LOAD_BTP_8x1(N, K) \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
rb1 = vec_xor(rb1, rb1); \ | |||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
LOAD_PAIR(pb0, rb0, rb1); \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \ | |||
rb1 = vec_xor(rb1, rb1); \ | |||
rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \ | |||
rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \ | |||
LOAD_PAIR(pb1, rb0, rb1); | |||
#define LOAD_BTP_4x2(N, K) \ | |||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ | |||
rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb0, rb1); \ | |||
t1 = vec_mergeh(rb2, rb3); \ | |||
LOAD_PAIR(pb0, t0, t1); \ | |||
t0 = vec_mergel(rb0, rb1); \ | |||
t1 = vec_mergel(rb2, rb3); \ | |||
LOAD_PAIR(pb1, t0, t1); | |||
#define LOAD_BTP_4x1(N, K) \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
rb1 = vec_xor(rb1, rb1); \ | |||
rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ | |||
rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ | |||
LOAD_PAIR(pb0, rb0, rb1); | |||
#define LOAD_BTP_2x2(N, K) \ | |||
rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ | |||
rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ | |||
t0 = vec_mergeh(rb0, rb1); \ | |||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ | |||
t1 = vec_mergel(rb0, rb1); \ | |||
__builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); | |||
#define LOAD_BTP_2x1(N, K) \ | |||
rb0 = vec_xor(rb0, rb0); \ | |||
rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ | |||
rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ | |||
__builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); | |||
#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]); | |||
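/* The LOAD_BTP_* macros gather two K elements from up to 8 rows of B (the output | |||
   columns), transpose them the same way, and pack the results into __vector_pair | |||
   operands (pb0..pb3) as required by the xvf64ger MMA instructions; LOAD_B_1x1 | |||
   just splats a single B element for the plain VSX fallback path. */ | |||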
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
#define KERNEL_MMA_1ACC(b0, a0) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
#define KERNEL_MMA_1ACC_(acc, b0, a0) \ | |||
__builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0); | |||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); \ | |||
result2 = vec_madd(a2, b2, result2); \ | |||
result3 = vec_madd(a3, b3, result3); | |||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); | |||
#define KERNEL_VMADD_1VSR(a0, b0) \ | |||
result = vec_madd(a0, b0, result); | |||
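/* __builtin_mma_xvf64gerpp performs a rank-1 update of a 4x2 double-precision | |||
   accumulator from a __vector_pair of four B values and a vector of two A | |||
   values; the KERNEL_VMADD_* macros are the plain vec_madd fallback used for the | |||
   remainder cases that do not fill a whole accumulator tile. */ | |||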
#ifdef B0 | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
#else | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG m, n, k; | |||
BLASLONG m8 = M & ~7; | |||
BLASLONG m4 = M & ~3; | |||
BLASLONG m2 = M & ~1; | |||
BLASLONG n8 = N & ~7; | |||
BLASLONG n4 = N & ~3; | |||
BLASLONG n2 = N & ~1; | |||
BLASLONG k2 = K & ~1; | |||
vector double valpha = vec_splats(alpha); | |||
#if !defined(B0) | |||
vector double vbeta = vec_splats(beta); | |||
#endif | |||
for (m = 0; m < m8; m += 8) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
INIT_8ACCS(); | |||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1, | |||
ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); | |||
KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3, | |||
ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); | |||
} | |||
// K remainder: update the 8x8 tile in 4-row/4-column chunks, reloading the A and B slices, so fewer vector registers are live at once and the compiler does not spill | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_1ACC_(acc0, pb0, ra0); | |||
KERNEL_MMA_1ACC_(acc1, pb0, ra1); | |||
LOAD_AT_4x1(m+4, k); | |||
KERNEL_MMA_1ACC_(acc2, pb0, ra0); | |||
KERNEL_MMA_1ACC_(acc3, pb0, ra1); | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BTP_4x1(n+4, k); | |||
KERNEL_MMA_1ACC_(acc4, pb0, ra0); | |||
KERNEL_MMA_1ACC_(acc5, pb0, ra1); | |||
LOAD_AT_4x1(m+4, k); | |||
KERNEL_MMA_1ACC_(acc6, pb0, ra0); | |||
KERNEL_MMA_1ACC_(acc7, pb0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
SAVE_4x2_ACC(&acc4, n+4, m+0); | |||
SAVE_4x2_ACC(&acc6, n+4, m+4); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
SAVE_4x2_ACC(&acc5, n+4, m+2); | |||
SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double result2 = ((vector double){0.,0.}); | |||
register vector double result3 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_B_1x1(n, k); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
SAVE_1x4_VSR(result2, n, m+4); | |||
SAVE_1x4_VSR(result3, n, m+6); | |||
} | |||
} | |||
for (; m < m4; m += 4) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_4x2(m, k); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2); | |||
KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
SAVE_4x2_ACC(&acc2, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_4x2(m, k); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_4x2(m, k); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_B_1x1(n, k); | |||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
} | |||
} | |||
for (; m < m2; m += 2) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_2x2(m, k); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0, ra1; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_2x2(m, k); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
KERNEL_MMA_1ACC(pb1, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0, ra1; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_2x2(m, k); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
KERNEL_MMA_1ACC(pb1, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n, m); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double ra0, ra1; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_AT_2x1(m, k); // only two rows of A remain in this tail | |||
LOAD_B_1x1(n, k); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
} | |||
} | |||
for (; m < M; m++) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1, pb2, pb3; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_8x2(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_8x1(n, k); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x1_ACC(&acc0, n+0, m+0); | |||
SAVE_4x1_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_4x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
KERNEL_MMA_1ACC(pb1, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_4x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x1_ACC(&acc0, n, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0; | |||
register vector double rb0, rb1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_2x2(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_A_1x1(k+1, m); | |||
KERNEL_MMA_1ACC(pb1, ra0); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_BTP_2x1(n, k); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x1_ACC(&acc0, n+0, m+0); | |||
} | |||
for (; n < N; n++) { | |||
FLOAT result = 0.0; | |||
for (k = 0; k < K; k++) { | |||
result += A[m*lda+k] * B[n*ldb+k]; | |||
} | |||
result = result * alpha; | |||
#if !defined(B0) | |||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
#else | |||
C[n*ldc+m] = result; | |||
#endif | |||
} | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,829 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
typedef __vector unsigned char vec_t; | |||
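/* Some compilers only expose the pair-assembly builtin under its MMA name, so | |||
   map it here and use __builtin_vsx_assemble_pair uniformly below. */ | |||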
#if !__has_builtin(__builtin_vsx_assemble_pair) | |||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair | |||
#endif | |||
#if !defined(B0) | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#define SAVE_4x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; | |||
#else | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_2x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#define SAVE_4x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = result[0]; \ | |||
C[(N+1)*ldc+M] = result[1]; | |||
#endif | |||
#define INIT_8ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); \ | |||
__builtin_mma_xxsetaccz(&acc4); \ | |||
__builtin_mma_xxsetaccz(&acc5); \ | |||
__builtin_mma_xxsetaccz(&acc6); \ | |||
__builtin_mma_xxsetaccz(&acc7); | |||
#define INIT_4ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); | |||
#define INIT_2ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); | |||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
#define LOAD_AT_8x2(M, K) \ | |||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
t0 = vec_mergeh(ra0, ra1); \ | |||
t1 = vec_mergeh(ra2, ra3); \ | |||
t2 = vec_mergel(ra0, ra1); \ | |||
t3 = vec_mergel(ra2, ra3); \ | |||
ra0 = t0; \ | |||
ra1 = t2; \ | |||
ra2 = t1; \ | |||
ra3 = t3; \ | |||
ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ | |||
ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ | |||
ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ | |||
ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ | |||
t0 = vec_mergeh(ra4, ra5); \ | |||
t1 = vec_mergeh(ra6, ra7); \ | |||
t2 = vec_mergel(ra4, ra5); \ | |||
t3 = vec_mergel(ra6, ra7); \ | |||
ra4 = t0; \ | |||
ra5 = t2; \ | |||
ra6 = t1; \ | |||
ra7 = t3; | |||
#define LOAD_AT_8x1(M, K) \ | |||
ra0 = vec_xor(ra0, ra0); \ | |||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
ra1 = vec_xor(ra1, ra1); \ | |||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ | |||
ra2 = vec_xor(ra2, ra2); \ | |||
ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ | |||
ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ | |||
ra3 = vec_xor(ra3, ra3); \ | |||
ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ | |||
ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); | |||
#define LOAD_AT_4x2(M, K) \ | |||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ | |||
ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ | |||
t0 = vec_mergeh(ra0, ra1); \ | |||
t1 = vec_mergeh(ra2, ra3); \ | |||
t2 = vec_mergel(ra0, ra1); \ | |||
t3 = vec_mergel(ra2, ra3); \ | |||
ra0 = t0; \ | |||
ra1 = t2; \ | |||
ra2 = t1; \ | |||
ra3 = t3; | |||
#define LOAD_AT_4x1(M, K) \ | |||
ra0 = vec_xor(ra0, ra0); \ | |||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ | |||
ra1 = vec_xor(ra1, ra1); \ | |||
ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ | |||
ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); | |||
#define LOAD_AT_2x2(M, K) \ | |||
ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ | |||
ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ | |||
t0 = vec_mergeh(ra0, ra1); \ | |||
t1 = vec_mergel(ra0, ra1); \ | |||
ra0 = t0; \ | |||
ra1 = t1; | |||
#define LOAD_AT_2x1(M, K) \ | |||
ra0 = vec_xor(ra0, ra0); \ | |||
ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ | |||
ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); | |||
#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); | |||
#define LOAD_BP_1x8(K, N) \ | |||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ | |||
pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); | |||
#define LOAD_BP_1x4(K, N) \ | |||
pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); | |||
#define LOAD_BP_1x2(K, N) \ | |||
t0 = vec_xl(0, B+((K)*ldb)+N); \ | |||
__builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); | |||
#define LOAD_B_1x8(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
rb1 = vec_xl(0, B+(K*ldb)+N+2); \ | |||
rb2 = vec_xl(0, B+(K*ldb)+N+4); \ | |||
rb3 = vec_xl(0, B+(K*ldb)+N+6); | |||
#define LOAD_B_1x4(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
rb1 = vec_xl(0, B+(K*ldb)+N+2); | |||
#define LOAD_B_1x2(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); | |||
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); | |||
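/* Unlike the previous kernel, consecutive column elements of a B row are | |||
   contiguous here, so LOAD_BP_1x8/1x4 can load __vector_pair operands straight | |||
   from memory instead of assembling them from transposed vectors. */ | |||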
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ | |||
__builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ | |||
__builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ | |||
__builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ | |||
__builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); | |||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ | |||
__builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ | |||
__builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); | |||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ | |||
__builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); | |||
#define KERNEL_MMA_1ACC(b0, a0) \ | |||
__builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); | |||
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); \ | |||
result2 = vec_madd(a2, b2, result2); \ | |||
result3 = vec_madd(a3, b3, result3); | |||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); | |||
#define KERNEL_VMADD_1VSR(a0, b0) \ | |||
result = vec_madd(a0, b0, result); | |||
#define PACK_A(ra0, ra1, ra2, ra3, offset) \ | |||
vec_xst(ra0, 0, packA+(k*8)+0+offset); \ | |||
vec_xst(ra1, 0, packA+(k*8)+2+offset); \ | |||
vec_xst(ra2, 0, packA+(k*8)+4+offset); \ | |||
vec_xst(ra3, 0, packA+(k*8)+6+offset); | |||
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ | |||
ra0 = vec_xl(0, packA+(k*8)+0+offset); \ | |||
ra1 = vec_xl(0, packA+(k*8)+2+offset); \ | |||
ra2 = vec_xl(0, packA+(k*8)+4+offset); \ | |||
ra3 = vec_xl(0, packA+(k*8)+6+offset); | |||
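/* On-demand packing of A: during the first column block (n == 0) the transposed | |||
   8-row panel of A is consumed and stored into packA via PACK_A; the remaining | |||
   column blocks then reload it with LOAD_PACKED_A instead of transposing from A | |||
   again. */ | |||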
#ifdef B0 | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
#else | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG m, n, k; | |||
BLASLONG m8 = M & ~7; | |||
BLASLONG m4 = M & ~3; | |||
BLASLONG m2 = M & ~1; | |||
BLASLONG n8 = N & ~7; | |||
BLASLONG n4 = N & ~3; | |||
BLASLONG n2 = N & ~1; | |||
BLASLONG k2 = K & ~1; | |||
#if defined(__GNUC__) && !defined(__clang__) | |||
int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; | |||
#else | |||
int has_packing = 0; | |||
#endif | |||
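/* Packing is only enabled with GCC, and only when every dimension is at least 32 | |||
   so the packed 8 x K panel (K*8 doubles) is reused often enough to pay for the | |||
   extra stores; otherwise the plain transposing loads are used. */ | |||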
double *packA; | |||
if (has_packing) packA = (double *)malloc(K*8*sizeof(double)); | |||
vector double valpha = vec_splats(alpha); | |||
#if !defined(B0) | |||
vector double vbeta = vec_splats(beta); | |||
#endif | |||
for (m = 0; m < m8; m += 8) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
INIT_8ACCS(); | |||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0, pb1; | |||
if (has_packing) { | |||
if (n == 0) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); | |||
PACK_A(ra0, ra2, ra4, ra6, 0); | |||
LOAD_BP_1x8(k+1, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); | |||
PACK_A(ra1, ra3, ra5, ra7, 8); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
PACK_A(ra0, ra1, ra2, ra3, 0); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); | |||
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); | |||
LOAD_BP_1x8(k+1, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); | |||
LOAD_BP_1x8(k+1, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
SAVE_4x2_ACC(&acc4, n+0, m+4); | |||
SAVE_4x2_ACC(&acc6, n+0, m+6); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
SAVE_4x2_ACC(&acc5, n+4, m+4); | |||
SAVE_4x2_ACC(&acc7, n+4, m+6); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0; | |||
if (!has_packing) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
LOAD_BP_1x4(k+1, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); | |||
LOAD_BP_1x4(k+1, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
SAVE_4x2_ACC(&acc2, n+0, m+4); | |||
SAVE_4x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0; | |||
if (!has_packing) { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_8x2(m, k); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
LOAD_BP_1x2(k+1, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
} else { | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); | |||
LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); | |||
LOAD_BP_1x2(k+1, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
SAVE_2x2_ACC(&acc2, n+0, m+4); | |||
SAVE_2x2_ACC(&acc3, n+0, m+6); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double result2 = ((vector double){0.,0.}); | |||
register vector double result3 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double rb0; | |||
if (!has_packing) { | |||
for (k = 0; k < K; k++) { | |||
LOAD_AT_8x1(m, k); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
} else { | |||
for (k = 0; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
SAVE_1x4_VSR(result2, n, m+4); | |||
SAVE_1x4_VSR(result3, n, m+6); | |||
} | |||
} | |||
for (; m < m4; m += 4) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_4x2(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2); | |||
LOAD_BP_1x8(k+1, n); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc2, n+0, m+2); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc3, n+4, m+2); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_4x2(m, k); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
LOAD_BP_1x4(k+1, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1, ra2, ra3; | |||
register vector double t0, t1, t2, t3; | |||
__vector_pair pb0; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_4x2(m, k); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); | |||
LOAD_BP_1x2(k+1, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n+0, m+0); | |||
SAVE_2x2_ACC(&acc1, n+0, m+2); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double ra0, ra1; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_AT_4x1(m, k); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
SAVE_1x4_VSR(result1, n, m+2); | |||
} | |||
} | |||
for (; m < m2; m += 2) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector double ra0, ra1; | |||
register vector double t0, t1; | |||
__vector_pair pb0, pb1; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_2x2(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
LOAD_BP_1x8(k+1, n); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_BP_1x8(k, n); | |||
KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0, ra1; | |||
register vector double t0, t1; | |||
__vector_pair pb0; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_2x2(m, k); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_BP_1x4(k+1, n); | |||
KERNEL_MMA_1ACC(pb0, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_BP_1x4(k, n); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_4x2_ACC(&acc0, n, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector double ra0, ra1; | |||
register vector double t0, t1; | |||
__vector_pair pb0; | |||
for (k = 0; k < k2; k += 2) { | |||
LOAD_AT_2x2(m, k); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
LOAD_BP_1x2(k+1, n); | |||
KERNEL_MMA_1ACC(pb0, ra1); | |||
} | |||
for (; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_BP_1x2(k, n); | |||
KERNEL_MMA_1ACC(pb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
vector double result[4]; | |||
SAVE_2x2_ACC(&acc0, n, m); | |||
} | |||
for (; n < N; n++) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_AT_2x1(m, k); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector double rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m+0); | |||
} | |||
} | |||
for (; m < M; m++) { | |||
for (n = 0; n < n8; n += 8) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double result2 = ((vector double){0.,0.}); | |||
register vector double result3 = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0, rb1, rb2, rb3; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(m, k); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); | |||
} | |||
SAVE_4x1_VSR(result, n, m); | |||
SAVE_4x1_VSR(result1, n+2, m); | |||
SAVE_4x1_VSR(result2, n+4, m); | |||
SAVE_4x1_VSR(result3, n+6, m); | |||
} | |||
for (; n < n4; n += 4) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double result1 = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0, rb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(m, k); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); | |||
} | |||
SAVE_4x1_VSR(result, n, m); | |||
SAVE_4x1_VSR(result1, n+2, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
register vector double result = ((vector double){0.,0.}); | |||
register vector double ra0; | |||
register vector double rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(m, k); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
SAVE_4x1_VSR(result, n, m); | |||
} | |||
for (; n < N; n++) { | |||
FLOAT result = 0.0; | |||
for (k = 0; k < K; k++) { | |||
result += A[m*lda+k] * B[k*ldb+n]; | |||
} | |||
result = result * alpha; | |||
#if !defined(B0) | |||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
#else | |||
C[n*ldc+m] = result; | |||
#endif | |||
} | |||
} | |||
if(has_packing) free(packA); | |||
return 0; | |||
} |
@@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y | |||
XXSPLTD_S(32,%x9,0) // alpha, alpha | |||
"sldi %6, %13, 3 \n\t" // lda * sizeof (double) | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha | |||
"xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha | |||
#else | |||
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha | |||
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha | |||
#endif | |||
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda | |||
"add %6, %6, %6 \n\t" // 2 * lda | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha | |||
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha | |||
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha | |||
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha | |||
#else | |||
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha | |||
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha | |||
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha | |||
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha | |||
#endif | |||
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda | |||
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda | |||
@@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda | |||
"add %10, %10, %10 \n\t" // 2 * lda | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha | |||
XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha | |||
XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha | |||
XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha | |||
XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha | |||
XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha | |||
XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha | |||
XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha | |||
#else | |||
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha | |||
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha | |||
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha | |||
@@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha | |||
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha | |||
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha | |||
#endif | |||
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda | |||
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda | |||
@@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
"one%=: \n\t" | |||
"lxvp 36, 0( %2) \n\t" // y0, y1 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 40, 32 \n\t" | |||
"xvmaddadp 37, 41, 32 \n\t" | |||
#else | |||
"xvmaddadp 36, 40, 34 \n\t" | |||
"xvmaddadp 37, 41, 34 \n\t" | |||
#endif | |||
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 42, 33 \n\t" | |||
"xvmaddadp 37, 43, 33 \n\t" | |||
#else | |||
"xvmaddadp 36, 42, 35 \n\t" | |||
"xvmaddadp 37, 43, 35 \n\t" | |||
#endif | |||
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 44, 34 \n\t" | |||
"xvmaddadp 37, 45, 34 \n\t" | |||
#else | |||
"xvmaddadp 36, 44, 32 \n\t" | |||
"xvmaddadp 37, 45, 32 \n\t" | |||
#endif | |||
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 46, 35 \n\t" | |||
"xvmaddadp 37, 47, 35 \n\t" | |||
#else | |||
"xvmaddadp 36, 46, 33 \n\t" | |||
"xvmaddadp 37, 47, 33 \n\t" | |||
#endif | |||
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 50, 38 \n\t" | |||
"xvmaddadp 37, 51, 38 \n\t" | |||
#else | |||
"xvmaddadp 36, 50, 48 \n\t" | |||
"xvmaddadp 37, 51, 48 \n\t" | |||
#endif | |||
"lxvpx 50, %7, %11 \n\t" // a4[0] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 52, 39 \n\t" | |||
"xvmaddadp 37, 53, 39 \n\t" | |||
#else | |||
"xvmaddadp 36, 52, 49 \n\t" | |||
"xvmaddadp 37, 53, 49 \n\t" | |||
#endif | |||
"lxvpx 52, %8, %11 \n\t" // a5[0] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 54, 48 \n\t" | |||
"xvmaddadp 37, 55, 48 \n\t" | |||
#else | |||
"xvmaddadp 36, 54, 38 \n\t" | |||
"xvmaddadp 37, 55, 38 \n\t" | |||
#endif | |||
"lxvpx 54, %9, %11 \n\t" // a6[0] | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 56, 49 \n\t" | |||
"xvmaddadp 37, 57, 49 \n\t" | |||
#else | |||
"xvmaddadp 36, 56, 39 \n\t" | |||
"xvmaddadp 37, 57, 39 \n\t" | |||
#endif | |||
"lxvpx 56, %10, %11 \n\t" // a7[0] | |||
"addi %11, %11, 32 \n\t" | |||
@@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
"two%=: \n\t" | |||
"lxvp 36, 0( %2) \n\t" // y0, y1 | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 36, 40, 32 \n\t" | |||
"xvmaddadp 37, 41, 32 \n\t" | |||
"xvmaddadp 36, 42, 33 \n\t" | |||
"xvmaddadp 37, 43, 33 \n\t" | |||
"xvmaddadp 36, 44, 34 \n\t" | |||
"xvmaddadp 37, 45, 34 \n\t" | |||
"xvmaddadp 36, 46, 35 \n\t" | |||
"xvmaddadp 37, 47, 35 \n\t" | |||
"xvmaddadp 36, 50, 38 \n\t" | |||
"xvmaddadp 37, 51, 38 \n\t" | |||
"xvmaddadp 36, 52, 39 \n\t" | |||
"xvmaddadp 37, 53, 39 \n\t" | |||
"xvmaddadp 36, 54, 48 \n\t" | |||
"xvmaddadp 37, 55, 48 \n\t" | |||
"xvmaddadp 36, 56, 49 \n\t" | |||
"xvmaddadp 37, 57, 49 \n\t" | |||
#else | |||
"xvmaddadp 36, 40, 34 \n\t" | |||
"xvmaddadp 37, 41, 34 \n\t" | |||
"xvmaddadp 36, 42, 35 \n\t" | |||
@@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
"xvmaddadp 37, 55, 38 \n\t" | |||
"xvmaddadp 36, 56, 39 \n\t" | |||
"xvmaddadp 37, 57, 39 \n\t" | |||
#endif | |||
"stxvp 36, 0( %2) \n\t" // y0, y1 | |||
: | |||
@@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||
"lxvp 40, 32(%[y]) \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
XXMRGHD_S(42,34,35) | |||
XXMRGLD_S(43,34,35) | |||
XXMRGHD_S(44,4,5) | |||
XXMRGLD_S(45,4,5) | |||
#else | |||
XXMRGLD_S(42,35,34) | |||
XXMRGHD_S(43,35,34) | |||
XXMRGLD_S(44,5,4) | |||
XXMRGHD_S(45,5,4) | |||
#endif | |||
"xvadddp 42,42,43 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
XXMRGHD_S(46,6,7) | |||
XXMRGLD_S(47,6,7) | |||
#else | |||
XXMRGLD_S(46,7,6) | |||
XXMRGHD_S(47,7,6) | |||
#endif | |||
"xvadddp 44,44,45 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
XXMRGHD_S(48,8,9) | |||
XXMRGLD_S(49,8,9) | |||
#else | |||
XXMRGLD_S(48,9,8) | |||
XXMRGHD_S(49,9,8) | |||
#endif | |||
"xvadddp 46,46,47 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 38,42,36 \n\t" | |||
"xvmaddadp 39,44,36 \n\t" | |||
#else | |||
"xvmaddadp 39,42,36 \n\t" | |||
"xvmaddadp 38,44,36 \n\t" | |||
#endif | |||
"xvadddp 48,48,49 \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 41,48,36 \n\t" | |||
#else | |||
"xvmaddadp 41,46,36 \n\t" | |||
#endif | |||
"stxvp 38, 0(%[y]) \n\t" | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
"xvmaddadp 40,46,36 \n\t" | |||
#else | |||
"xvmaddadp 40,48,36 \n\t" | |||
#endif | |||
"stxvp 40, 32(%[y]) \n\t" | |||
: [memy] "+m" (*(double (*)[8])y), | |||
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "drot_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "drot_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "drot_microk_power8.c" | |||
#include "drot_microk_power10.c" | |||
#endif | |||
#endif | |||
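/* The POWER10 drot micro-kernel is now used on big-endian builds as well; the | |||
   endianness condition is dropped both here and in the runtime size check below, | |||
   and the same relaxation is applied to dscal, dswap and sasum further down. */ | |||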
@@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 16 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "dscal_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "dscal_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "dscal_microk_power8.c" | |||
#include "dscal_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
if ( da == 0.0 ) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 16 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
@@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
else | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 16 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "dswap_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "swap_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "dswap_microk_power8.c" | |||
#include "swap_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
if ( (inc_x == 1) && (inc_y == 1 )) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 32 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
@@ -0,0 +1,84 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) | |||
{ | |||
double MNK = (double) M * (double) N * (double) K; | |||
#if defined(DOUBLE) // dgemm | |||
// gcc 11 (up to 11.2) has an issue when multiple assemble_pairs are used;
// the issue affects both dgemm_nn and dgemm_tn.
#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)) | |||
if (!transb) | |||
return 0; | |||
#endif | |||
if (MNK <= 54.0*54.0*54.0) | |||
return 1; | |||
#else // sgemm | |||
#if defined(__GNUC__) && defined(__clang__) | |||
// clang generates code with register spilling in the packing region, so this
// optimization is disabled for clang. Since on-demand packing is one of the
// reasons the small kernels outperform the normal flow as MNK grows, with it
// disabled the MNK range handled by the clang-generated code has to be
// reduced.
if (MNK > 84.0*84.0*84.0) | |||
return 0; | |||
if (transa && !transb) { | |||
// sgemm_tn works better when packing on-demand is used | |||
if (MNK <= 64.0*64.0*64.0 && K >= 4) | |||
return 1; | |||
else | |||
return 0; | |||
} | |||
#else // gcc | |||
if (MNK > 100.0*100.0*100.0) | |||
return 0; | |||
#endif | |||
// With more than one CPU available, the multi-threaded path outperforms (or
// approaches) the small kernel for larger MNK, so only use it for small sizes.
if (num_cpu_avail(3) > 1) { | |||
if (MNK <= 64.0*64.0*64.0) | |||
return 1; | |||
} else { | |||
return 1; | |||
} | |||
#endif | |||
return 0; | |||
} |
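Illustrative sketch (not part of the patch): stripped of the compiler- and thread-count-specific cases above, the permit logic is just a cutoff on M*N*K. The helper name small_gemm_permit_sketch and the is_double flag below are hypothetical, and the clang and multi-threaded branches above use tighter limits (84^3 and 64^3).
/* Hypothetical standalone illustration of the size gate used by the permit
 * routine above: 1 selects the small kernel, 0 falls back to the normal path. */
static int small_gemm_permit_sketch(long M, long N, long K, int is_double)
{
    double MNK = (double)M * (double)N * (double)K;
    if (is_double)
        return MNK <= 54.0 * 54.0 * 54.0;    /* dgemm cutoff from the code above */
    return MNK <= 100.0 * 100.0 * 100.0;     /* sgemm, gcc, single-threaded case */
}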
@@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "sasum_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "sasum_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "sasum_microk_power8.c" | |||
#include "sasum_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
if ( inc_x == 1 ) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 32 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
@@ -0,0 +1,887 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2021, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <altivec.h> | |||
typedef __vector unsigned char vec_t; | |||
#if !defined(B0) | |||
#define SAVE_4x4_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+3)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[2] = vec_madd(result[2], valpha, rc0); \ | |||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[3] = vec_madd(result[3], valpha, rc0); \ | |||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
#define SAVE_2x4_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
rc0 = vec_xl(0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[0] = vec_madd(result[0], valpha, rc0); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
rc0 = vec_xl(0, C+(N+1)*ldc+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result[1] = vec_madd(result[1], valpha, rc0); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
rc0 = vec_xl(0, C+((N)*ldc)+M); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#define SAVE_2x2_VSR(result, N, M) \ | |||
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ | |||
rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ | |||
rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst_len(result, C+(N*ldc)+M, 8); \ | |||
C[(N+1)*ldc+M+0] = result[2]; \ | |||
C[(N+1)*ldc+M+1] = result[3]; | |||
#define SAVE_1x2_VSR(result, N, M) \ | |||
rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ | |||
rc0 = vec_mul(rc0, vbeta); \ | |||
result = vec_madd(result, valpha, rc0); \ | |||
vec_xst_len(result, C+(N*ldc)+M, 8); | |||
#define SAVE_4x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ | |||
C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ | |||
C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; | |||
#define SAVE_2x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ | |||
C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; | |||
#else | |||
#define SAVE_4x4_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst(result[2], 0, C+(N+2)*ldc+M); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst(result[3], 0, C+(N+3)*ldc+M); | |||
#define SAVE_4x2_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ | |||
result[2] = vec_mul(result[2], valpha); \ | |||
vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ | |||
result[3] = vec_mul(result[3], valpha); \ | |||
vec_xst_len(result[3], C+(N+3)*ldc+M, 8); | |||
#define SAVE_2x4_ACC(ACC, N, M) \ | |||
__builtin_mma_disassemble_acc((void *)result, ACC); \ | |||
result[0] = vec_mul(result[0], valpha); \ | |||
vec_xst(result[0], 0, C+(N+0)*ldc+M); \ | |||
result[1] = vec_mul(result[1], valpha); \ | |||
vec_xst(result[1], 0, C+(N+1)*ldc+M); | |||
#define SAVE_1x4_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst(result, 0, C+((N)*ldc)+M); | |||
#define SAVE_2x2_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst_len(result, C+(N*ldc)+M, 8); \ | |||
C[(N+1)*ldc+M+0] = result[2]; \ | |||
C[(N+1)*ldc+M+1] = result[3]; | |||
#define SAVE_1x2_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
vec_xst_len(result, C+(N*ldc)+M, 8); | |||
#define SAVE_4x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = result[0]; \ | |||
C[(N+1)*ldc+M] = result[1]; \ | |||
C[(N+2)*ldc+M] = result[2]; \ | |||
C[(N+3)*ldc+M] = result[3]; | |||
#define SAVE_2x1_VSR(result, N, M) \ | |||
result = vec_mul(result, valpha); \ | |||
C[(N+0)*ldc+M] = result[0]; \ | |||
C[(N+1)*ldc+M] = result[1]; | |||
#endif | |||
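/* Note (added for clarity): the first set of SAVE_* macros (B0 not defined)
   reloads the existing C tile and blends it as alpha*acc + beta*C; the B0
   variants store alpha*acc directly without reading C. */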
#define INIT_8ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); \ | |||
__builtin_mma_xxsetaccz(&acc4); \ | |||
__builtin_mma_xxsetaccz(&acc5); \ | |||
__builtin_mma_xxsetaccz(&acc6); \ | |||
__builtin_mma_xxsetaccz(&acc7); | |||
#define INIT_4ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); \ | |||
__builtin_mma_xxsetaccz(&acc2); \ | |||
__builtin_mma_xxsetaccz(&acc3); | |||
#define INIT_2ACCS() \ | |||
__builtin_mma_xxsetaccz(&acc0); \ | |||
__builtin_mma_xxsetaccz(&acc1); | |||
#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); | |||
#define LOAD_A_1x16(K, M) \ | |||
ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
ra1 = vec_xl(0, A+(K*lda)+M+4); \ | |||
ra2 = vec_xl(0, A+(K*lda)+M+8); \ | |||
ra3 = vec_xl(0, A+(K*lda)+M+12); | |||
#define LOAD_A_1x8(K, M) \ | |||
ra0 = vec_xl(0, A+(K*lda)+M+0); \ | |||
ra1 = vec_xl(0, A+(K*lda)+M+4); | |||
#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M); | |||
#define LOAD_A_2x2(K, M) \ | |||
ra0 = vec_splats(A[K*lda+M+0]); \ | |||
ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ | |||
ra0 = vec_insert(A[K*lda+M+1], ra0, 3); | |||
#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8); | |||
#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]); | |||
#define LOAD_B_1x16(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
rb1 = vec_xl(0, B+(K*ldb)+N+4); \ | |||
rb2 = vec_xl(0, B+(K*ldb)+N+8); \ | |||
rb3 = vec_xl(0, B+(K*ldb)+N+12); | |||
#define LOAD_B_1x8(K, N) \ | |||
rb0 = vec_xl(0, B+(K*ldb)+N+0); \ | |||
rb1 = vec_xl(0, B+(K*ldb)+N+4); | |||
#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N); | |||
#define LOAD_B_2x2(K, N) \ | |||
rb0 = vec_splats(B[K*ldb+N]); \ | |||
rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ | |||
rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); | |||
#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8); | |||
#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); | |||
#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ | |||
a0, a1, a2, a3, a4, a5, a6, a7) \ | |||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ | |||
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ | |||
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ | |||
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ | |||
__builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ | |||
__builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ | |||
__builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ | |||
__builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); | |||
#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ | |||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ | |||
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ | |||
__builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ | |||
__builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); | |||
#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ | |||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ | |||
__builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); | |||
#define KERNEL_MMA_1ACC(b0, a0) \ | |||
__builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); | |||
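/* Note (added for clarity): each __builtin_mma_xvf32gerpp call accumulates the
   outer product of two 4-float vectors (one taken from B, one from A) into a
   4x4 tile held in a __vector_quad accumulator; the KERNEL_MMA_* macros simply
   drive 1, 2, 4 or 8 such accumulators per k iteration. */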
#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); \ | |||
result2 = vec_madd(a2, b2, result2); \ | |||
result3 = vec_madd(a3, b3, result3); | |||
#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ | |||
result = vec_madd(a0, b0, result); \ | |||
result1 = vec_madd(a1, b1, result1); | |||
#define KERNEL_VMADD_1VSR(a0, b0) \ | |||
result = vec_madd(a0, b0, result); | |||
#define PACK_A(ra0, ra1, ra2, ra3, offset) \ | |||
vec_xst(ra0, 0, packA+(k*16)+0+offset); \ | |||
vec_xst(ra1, 0, packA+(k*16)+4+offset); \ | |||
vec_xst(ra2, 0, packA+(k*16)+8+offset); \ | |||
vec_xst(ra3, 0, packA+(k*16)+12+offset); | |||
#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ | |||
ra0 = vec_xl(0, packA+(k*16)+0+offset); \ | |||
ra1 = vec_xl(0, packA+(k*16)+4+offset); \ | |||
ra2 = vec_xl(0, packA+(k*16)+8+offset); \ | |||
ra3 = vec_xl(0, packA+(k*16)+12+offset); | |||
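/* Note (added for clarity): PACK_A copies the 16 A values loaded for the
   current k into the contiguous packA buffer, and LOAD_PACKED_A reads them
   back, so the A panel is fetched from the matrix only once (first n pass)
   and replayed from the buffer for the remaining n iterations. */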
#ifdef B0 | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||
#else | |||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||
#endif | |||
{ | |||
BLASLONG m, n, k; | |||
BLASLONG m16 = M & ~15; | |||
BLASLONG m8 = M & ~7; | |||
BLASLONG m4 = M & ~3; | |||
BLASLONG m2 = M & ~1; | |||
BLASLONG n16 = N & ~15; | |||
BLASLONG n8 = N & ~7; | |||
BLASLONG n4 = N & ~3; | |||
BLASLONG n2 = N & ~1; | |||
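/* Note (added for clarity): the masks round each dimension down to the
   nearest multiple of the block size, e.g. M = 21 gives m16 = 16, m8 = 16,
   m4 = 20 and m2 = 20; the loops below peel the remainders. */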
vector float valpha = vec_splats(alpha); | |||
#if !defined(B0) | |||
vector float vbeta = vec_splats(beta); | |||
#endif | |||
#if defined(__GNUC__) && !defined(__clang__) | |||
int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0; | |||
#else | |||
int has_packing = 0; | |||
#endif | |||
float *packA; | |||
if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); | |||
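/* Note (added for clarity): packA holds one 16-wide column panel of A for all
   K iterations (K*16 floats); it is only allocated when packing is enabled
   (GCC builds with M, N and K all >= 40, where the packing cost amortizes)
   and is freed after the main loops. */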
for (m = 0; m < m16; m += 16) { | |||
for (n = 0; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
INIT_8ACCS(); | |||
register vector float ra0, ra1, ra2, ra3; | |||
register vector float rb0, rb1; | |||
if (has_packing) { | |||
if (n == 0) { | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x16(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
PACK_A(ra0, ra1, ra2, ra3, 0); | |||
} | |||
} else { | |||
for (k = 0; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
} | |||
} else { | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x16(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, | |||
ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc2, n+0, m+4); | |||
SAVE_4x4_ACC(&acc4, n+0, m+8); | |||
SAVE_4x4_ACC(&acc6, n+0, m+12); | |||
SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
SAVE_4x4_ACC(&acc3, n+4, m+4); | |||
SAVE_4x4_ACC(&acc5, n+4, m+8); | |||
SAVE_4x4_ACC(&acc7, n+4, m+12); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector float ra0, ra1, ra2, ra3; | |||
register vector float rb0; | |||
if (!has_packing) { | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x16(k, m); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
} | |||
} else { | |||
for (k = 0; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc1, n+0, m+4); | |||
SAVE_4x4_ACC(&acc2, n+0, m+8); | |||
SAVE_4x4_ACC(&acc3, n+0, m+12); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector float ra0, ra1, ra2, ra3; | |||
register vector float rb0; | |||
if (!has_packing) { | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x16(k, m); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
} | |||
} else { | |||
for (k = 0; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_2x4_ACC(&acc0, n, m+0); | |||
SAVE_2x4_ACC(&acc1, n, m+4); | |||
SAVE_2x4_ACC(&acc2, n, m+8); | |||
SAVE_2x4_ACC(&acc3, n, m+12); | |||
} | |||
for (; n < N; n++) { | |||
vector float result = ((vector float){0., 0., 0., 0.}); | |||
vector float result1 = ((vector float){0., 0., 0., 0.}); | |||
vector float result2 = ((vector float){0., 0., 0., 0.}); | |||
vector float result3 = ((vector float){0., 0., 0., 0.}); | |||
register vector float ra0, ra1, ra2, ra3; | |||
register vector float rb0; | |||
if (!has_packing) { | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x16(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
} else { | |||
for (k = 0; k < K; k++) { | |||
LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); | |||
} | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m); | |||
SAVE_1x4_VSR(result1, n, m+4); | |||
SAVE_1x4_VSR(result2, n, m+8); | |||
SAVE_1x4_VSR(result3, n, m+12); | |||
} | |||
} | |||
for (; m < m8; m += 8) { | |||
for (n = 0; n < n16; n += 16) { | |||
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
INIT_8ACCS(); | |||
register vector float ra0, ra1; | |||
register vector float rb0, rb1, rb2, rb3; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x16(k, n); | |||
KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, | |||
ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc4, n+0, m+4); | |||
SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
SAVE_4x4_ACC(&acc5, n+4, m+4); | |||
SAVE_4x4_ACC(&acc2, n+8, m+0); | |||
SAVE_4x4_ACC(&acc6, n+8, m+4); | |||
SAVE_4x4_ACC(&acc3, n+12, m+0); | |||
SAVE_4x4_ACC(&acc7, n+12, m+4); | |||
} | |||
for (; n < n8; n += 8) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector float ra0, ra1; | |||
register vector float rb0, rb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc2, n+0, m+4); | |||
SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
SAVE_4x4_ACC(&acc3, n+4, m+4); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector float ra0, ra1; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc1, n+0, m+4); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector float ra0, ra1; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_2x4_ACC(&acc0, n, m+0); | |||
SAVE_2x4_ACC(&acc1, n, m+4); | |||
} | |||
for (; n < N; n++) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
vector float result1 = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0, ra1; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x8(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m); | |||
SAVE_1x4_VSR(result1, n, m+4); | |||
} | |||
} | |||
for (; m < m4; m += 4) { | |||
for (n = 0; n < n16; n += 16) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector float ra0; | |||
register vector float rb0, rb1, rb2, rb3; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x16(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
SAVE_4x4_ACC(&acc2, n+8, m+0); | |||
SAVE_4x4_ACC(&acc3, n+12, m+0); | |||
} | |||
for (; n < n8; n += 8) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector float ra0; | |||
register vector float rb0, rb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
SAVE_4x4_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_MMA_1ACC(rb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x4_ACC(&acc0, n+0, m+0); | |||
} | |||
for (; n < n2; n += 2) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_MMA_1ACC(rb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_2x4_ACC(&acc0, n, m); | |||
} | |||
for (; n < N; n++) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x4(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
SAVE_1x4_VSR(result, n, m); | |||
} | |||
} | |||
for (; m < m2; m += 2) { | |||
for (n = 0; n < n16; n += 16) { | |||
__vector_quad acc0, acc1, acc2, acc3; | |||
INIT_4ACCS(); | |||
register vector float ra0; | |||
register vector float rb0, rb1, rb2, rb3; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_B_1x16(k, n); | |||
KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
SAVE_4x2_ACC(&acc2, n+8, m+0); | |||
SAVE_4x2_ACC(&acc3, n+12, m+0); | |||
} | |||
for (; n < n8; n += 8) { | |||
__vector_quad acc0, acc1; | |||
INIT_2ACCS(); | |||
register vector float ra0; | |||
register vector float rb0, rb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
SAVE_4x2_ACC(&acc1, n+4, m+0); | |||
} | |||
for (; n < n4; n += 4) { | |||
__vector_quad acc0; | |||
INIT_1ACC(); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_MMA_1ACC(rb0, ra0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
vector float result[4]; | |||
SAVE_4x2_ACC(&acc0, n+0, m+0); | |||
} | |||
for (; n < n2; n += 2) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_2x2(k, m); | |||
LOAD_B_2x2(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
SAVE_2x2_VSR(result, n, m); | |||
} | |||
for (; n < N; n++) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x2(k, m); | |||
LOAD_B_1x1(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
#if !defined(B0) | |||
register vector float rc0; | |||
#endif | |||
SAVE_1x2_VSR(result, n, m); | |||
} | |||
} | |||
for (; m < M; m++) { | |||
for (n = 0; n < n16; n += 16) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
vector float result1 = ((vector float){0.,0.,0.,0.}); | |||
vector float result2 = ((vector float){0.,0.,0.,0.}); | |||
vector float result3 = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0, rb1, rb2, rb3; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x16(k, n); | |||
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); | |||
} | |||
SAVE_4x1_VSR(result, n+0, m); | |||
SAVE_4x1_VSR(result1, n+4, m); | |||
SAVE_4x1_VSR(result2, n+8, m); | |||
SAVE_4x1_VSR(result3, n+12, m); | |||
} | |||
for (; n < n8; n += 8) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
vector float result1 = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0, rb1; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x8(k, n); | |||
KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); | |||
} | |||
SAVE_4x1_VSR(result, n+0, m); | |||
SAVE_4x1_VSR(result1, n+4, m); | |||
} | |||
for (; n < n4; n += 4) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x4(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
SAVE_4x1_VSR(result, n+0, m); | |||
} | |||
for (; n < n2; n += 2) { | |||
vector float result = ((vector float){0.,0.,0.,0.}); | |||
register vector float ra0; | |||
register vector float rb0; | |||
for (k = 0; k < K; k++) { | |||
LOAD_A_1x1(k, m); | |||
LOAD_B_1x2(k, n); | |||
KERNEL_VMADD_1VSR(ra0, rb0); | |||
} | |||
SAVE_2x1_VSR(result, n+0, m); | |||
} | |||
for (; n < N; n++) { | |||
FLOAT result = 0.0f; | |||
for (k = 0; k < K; k++) { | |||
result += A[k*lda+m] * B[k*ldb+n]; | |||
} | |||
result = result * alpha; | |||
#if !defined(B0) | |||
C[n*ldc+m] = (C[n*ldc+m] * beta) + result; | |||
#else | |||
C[n*ldc+m] = result; | |||
#endif | |||
} | |||
} | |||
if (has_packing) free (packA); | |||
return 0; | |||
} |
@@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "srot_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "srot_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "srot_microk_power8.c" | |||
#include "srot_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 16 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "sscal_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "sscal_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "sscal_microk_power8.c" | |||
#include "sscal_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
if ( da == 0.0 ) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 32 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
@@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
else | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 32 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
@@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#if defined(POWER8) || defined(POWER9) | |||
#include "sswap_microk_power8.c" | |||
#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#include "swap_microk_power10.c" | |||
#elif defined(POWER10) | |||
#include "sswap_microk_power8.c" | |||
#include "swap_microk_power10.c" | |||
#endif | |||
#endif | |||
@@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
if ( (inc_x == 1) && (inc_y == 1 )) | |||
{ | |||
#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
#if defined(POWER10) | |||
if ( n >= 64 ) | |||
{ | |||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
@@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, | |||
double alpha_r, double alpha_i) | |||
{ | |||
#if !defined(CONJ) | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
static const double mvec[2] = { -1.0, 1.0 }; | |||
#else | |||
static const double mvec[2] = { 1.0, -1.0 }; | |||
#endif | |||
#else | |||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
static const double mvec[2] = { 1.0, -1.0 }; | |||
#else | |||
static const double mvec[2] = { -1.0, 1.0 }; | |||
#endif | |||
#endif | |||
const double *mvecp = mvec; | |||
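/* Note (added for clarity): mvec carries the sign pattern for the complex
   product (a_r + i*a_i)*(x_r + i*x_i) = (a_r*x_r - a_i*x_i) + i*(a_r*x_i + a_i*x_r);
   which vector lane gets the minus sign depends on the element order in the
   VSX register, hence the separate big- and little-endian (and CONJ) tables above. */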