@@ -158,7 +158,7 @@ jobs: | |||
strategy: | |||
fail-fast: false | |||
matrix: | |||
msystem: [UCRT64, MINGW32, CLANG64, CLANG32] | |||
msystem: [UCRT64, MINGW32, CLANG64] | |||
idx: [int32, int64] | |||
build-type: [Release] | |||
include: | |||
@@ -174,14 +174,6 @@ jobs: | |||
idx: int32 | |||
target-prefix: mingw-w64-clang-x86_64 | |||
fc-pkg: fc | |||
# Compiling with Flang 16 seems to cause test errors on machines | |||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
no-avx512-flags: -DNO_AVX512=1 | |||
- msystem: CLANG32 | |||
idx: int32 | |||
target-prefix: mingw-w64-clang-i686 | |||
fc-pkg: cc | |||
c-lapack-flags: -DC_LAPACK=ON | |||
- msystem: UCRT64 | |||
idx: int64 | |||
idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
@@ -192,9 +184,6 @@ jobs: | |||
idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
target-prefix: mingw-w64-clang-x86_64 | |||
fc-pkg: fc | |||
# Compiling with Flang 16 seems to cause test errors on machines | |||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
no-avx512-flags: -DNO_AVX512=1 | |||
- msystem: UCRT64 | |||
idx: int32 | |||
target-prefix: mingw-w64-ucrt-x86_64 | |||
@@ -203,8 +192,6 @@ jobs: | |||
exclude: | |||
- msystem: MINGW32 | |||
idx: int64 | |||
- msystem: CLANG32 | |||
idx: int64 | |||
defaults: | |||
run: | |||
@@ -280,8 +267,6 @@ jobs: | |||
-DNUM_THREADS=64 \ | |||
-DTARGET=CORE2 \ | |||
${{ matrix.idx64-flags }} \ | |||
${{ matrix.c-lapack-flags }} \ | |||
${{ matrix.no-avx512-flags }} \ | |||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
.. | |||
@@ -4,6 +4,7 @@ | |||
cmake_minimum_required(VERSION 3.16.0) | |||
set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S") | |||
project(OpenBLAS C ASM) | |||
set(OpenBLAS_MAJOR_VERSION 0) | |||
@@ -229,3 +229,6 @@ In chronological order: | |||
* Christopher Daley <https://github.com/cdaley> | |||
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems | |||
* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | |||
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE |
@@ -351,4 +351,31 @@ endif | |||
endif | |||
else | |||
# NVIDIA HPC options necessary to enable SVE in the compiler | |||
ifeq ($(CORE), THUNDERX2T99) | |||
CCOMMON_OPT += -tp=thunderx2t99 | |||
FCOMMON_OPT += -tp=thunderx2t99 | |||
endif | |||
ifeq ($(CORE), NEOVERSEN1) | |||
CCOMMON_OPT += -tp=neoverse-n1 | |||
FCOMMON_OPT += -tp=neoverse-n1 | |||
endif | |||
ifeq ($(CORE), NEOVERSEV1) | |||
CCOMMON_OPT += -tp=neoverse-v1 | |||
FCOMMON_OPT += -tp=neoverse-v1 | |||
endif | |||
ifeq ($(CORE), NEOVERSEV2) | |||
CCOMMON_OPT += -tp=neoverse-v2 | |||
FCOMMON_OPT += -tp=neoverse-v2 | |||
endif | |||
ifeq ($(CORE), ARMV8SVE) | |||
CCOMMON_OPT += -tp=neoverse-v2 | |||
FCOMMON_OPT += -tp=neoverse-v2 | |||
endif | |||
ifeq ($(CORE), ARMV9SVE) | |||
CCOMMON_OPT += -tp=neoverse-v2 | |||
FCOMMON_OPT += -tp=neoverse-v2 | |||
endif | |||
endif |
@@ -191,22 +191,29 @@ endif | |||
#Generating OpenBLASConfig.cmake | |||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
# Append (>>) instead of truncating (>): the OpenBLAS_VERSION line has already
# been written to this file and must not be discarded.
@echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
ifneq ($(NO_SHARED),1) | |||
#ifeq logical or | |||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
ifeq ($(OSNAME), Darwin) | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
# Define the OpenBLAS::OpenBLAS imported target in the generated config file.
# Every echo must be appended to the file; without the redirection the text is
# only printed to the terminal during installation and the target never exists.
@echo "add_library(OpenBLAS::OpenBLAS SHARED IMPORTED)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "target_include_directories(OpenBLAS::OpenBLAS INTERFACE \$${OpenBLAS_INCLUDE_DIRS})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
# A SHARED IMPORTED target needs IMPORTED_LOCATION on every platform, so point it
# at the shared library recorded in OpenBLAS_LIBRARIES above.
@echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_LOCATION \$${OpenBLAS_LIBRARIES})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
# On Windows the DLL is the location; the import library is given separately.
@echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_IMPLIB \$${_OpenBLAS_ROOT_DIR}/lib/libopenblas.lib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
else | |||
#only static | |||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
@echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
endif | |||
#Generating OpenBLASConfigVersion.cmake | |||
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
@@ -1613,6 +1613,13 @@ NO_AFFINITY = 1 | |||
endif | |||
endif | |||
ifeq ($(ARCH), POWER) | |||
ifeq ($(DEBUG), 1) | |||
CCOMMON_OPT := $(filter-out -O%, $(CCOMMON_OPT)) -O0 | |||
FCOMMON_OPT := $(filter-out -O%, $(FCOMMON_OPT)) -O0 | |||
endif | |||
endif | |||
ifdef NO_AFFINITY | |||
ifeq ($(NO_AFFINITY), 0) | |||
override undefine NO_AFFINITY | |||
@@ -15,11 +15,14 @@ OSUOSL IBMZ-CI [ library based on GotoBLAS2 1.13 BSD version. | |||
Please read the documentation in the OpenBLAS `docs` folder: <https://github.com/OpenMathLib/OpenBLAS/tree/develop/docs>.
For more information about OpenBLAS, please see: | |||
- The documentation at [openmathlib.org/OpenBLAS/docs/](http://www.openmathlib.org/OpenBLAS/docs), | |||
- The home page at [openmathlib.org/OpenBLAS/](http://www.openmathlib.org/OpenBLAS). | |||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: | |||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six | |||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful. | |||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare [here](https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/) or YouTube [here](https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek) may be helpful. | |||
## Binary Packages | |||
@@ -27,24 +30,29 @@ We provide official binary packages for the following platform: | |||
* Windows x86/x86_64 | |||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases). | |||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the [Releases section of the GitHub project page](https://github.com/OpenMathLib/OpenBLAS/releases). | |||
OpenBLAS is also packaged for many package managers - see [the installation section of the docs](http://www.openmathlib.org/OpenBLAS/docs/install/) for details. | |||
## Installation from Source | |||
Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code | |||
using Git from https://github.com/OpenMathLib/OpenBLAS.git. (If you want the most up to date version, be | |||
sure to use the develop branch - master is several years out of date due to a change of maintainership.) | |||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. | |||
Most can also be given directly on the make or cmake command line. | |||
Obtain the source code from https://github.com/OpenMathLib/OpenBLAS/. Note that the default branch | |||
is `develop` (a `master` branch is still present, but far out of date). | |||
Build-time parameters can be chosen in `Makefile.rule`; see that file for a short description of each option.
Most options can also be given directly on the command line as parameters to your `make` or `cmake` invocation. | |||
### Dependencies | |||
Building OpenBLAS requires the following to be installed: | |||
* GNU Make or CMake | |||
* A C compiler, e.g. GCC or Clang | |||
* A C compiler, e.g. GCC or Clang | |||
* A Fortran compiler (optional, for LAPACK) | |||
In general, using a recent version of the compiler is strongly recommended. | |||
If a Fortran compiler is not available, it is possible to compile an older version of the included LAPACK | |||
that has been machine-translated to C. | |||
### Normal compile | |||
@@ -60,6 +68,9 @@ For building with `cmake`, the usual conventions apply, i.e. create a build dire | |||
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any | |||
build options you plan to set. | |||
For more details, see the [Building from source](http://www.openmathlib.org/OpenBLAS/docs/install/#building-from-source) | |||
section in the docs. | |||
### Cross compile | |||
Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler. | |||
@@ -76,10 +87,12 @@ Examples: | |||
make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc | |||
``` | |||
You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the | |||
You can find instructions for other cases both in the "Supported Systems" section below and in | |||
the [Building from source docs](http://www.openmathlib.org/OpenBLAS/docs/install). | |||
The `.yml` scripts included with the sources (which contain the | |||
build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints. | |||
When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. | |||
When compiling for a more modern CPU target of the same architecture, e.g. `TARGET=SKYLAKEX` on a `HASWELL` host, option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build. | |||
### Debug version | |||
@@ -325,11 +338,14 @@ Please see Changelog.txt. | |||
## Troubleshooting | |||
* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/docs/faq,md) in the docs folder first. | |||
* Please read the [FAQ](http://www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. | |||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. | |||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. | |||
Clang 3.0 will generate the wrong AVX binary code. | |||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. | |||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake/CooperLake AVX512 kernels.
* Please use LLVM version 18 and above (version 19 and above on Windows) if you plan to use
its new flang compiler for Fortran.
* Please use GCC version 11 and above to compile OpenBLAS on the POWER architecture.
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), | |||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build | |||
the library with `BIGNUMA=1`. | |||
@@ -350,4 +366,4 @@ Please see Changelog.txt. | |||
## Donation | |||
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). | |||
Please see [the donations section](http://www.openmathlib.org/OpenBLAS/docs/about/#donations) in the docs. |
@@ -43,7 +43,17 @@ have all what it takes to build OpenBLAS from source, plus `python` and | |||
$ python -mpip install numpy meson ninja pytest pytest-benchmark | |||
``` | |||
The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`. | |||
The Meson build system looks for the installed OpenBLAS using `pkg-config`, so the `openblas.pc` file created during the OpenBLAS build needs
to be somewhere on the `pkg-config` search path or in a folder pointed to by the environment variable `PKG_CONFIG_PATH`.
If you want to build the benchmark suite using flang (or flang-new) instead of gfortran for the Fortran parts, you currently need | |||
to edit the meson.build file and change the line `'fortran_std=legacy'` to `'fortran_std=none'` to work around an incompatibility | |||
between Meson and flang. | |||
If you are building and running the benchmark under MS Windows, it may be necessary to copy the generated openblas_wrap module from | |||
your build folder to the `benchmarks` folder. | |||
The benchmark syntax is consistent with that of the `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`.
An ASV compatible benchmark suite is planned but currently not implemented. | |||
@@ -6,6 +6,9 @@ hostarch=`uname -m | sed -e 's/i.86/x86/'` | |||
if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then | |||
hostarch=`uname -p` | |||
fi | |||
if [ "$hostarch" = "evbarm" ]; then | |||
hostarch=`uname -p` | |||
fi | |||
case "$hostarch" in | |||
amd64) hostarch=x86_64 ;; | |||
arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;; | |||
@@ -45,13 +45,15 @@ if (NOT ONLY_CBLAS) | |||
# TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile | |||
# TODO: set FEXTRALIB flags a la f_check? | |||
# Fortran symbol naming: Intel LLVM Fortran on Windows is given
# /names:lowercase and /assume:nounderscore (plus /fp:precise and /recursive);
# every other configuration records a single trailing underscore instead.
if (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND x${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")
  string(APPEND FCOMMON_OPT " /fp:precise /recursive /names:lowercase /assume:nounderscore")
else ()
  set(BU "_")
  file(APPEND ${TARGET_CONF_TEMP}
    "#define BUNDERSCORE _\n#define NEEDBUNDERSCORE 1\n#define NEED2UNDERSCORES 0\n")
endif ()
else () | |||
#When we only build CBLAS, we set NOFORTRAN=2 | |||
@@ -269,6 +269,31 @@ if (${F_COMPILER} STREQUAL "CRAY") | |||
endif () | |||
endif () | |||
# NAG Fortran compiler: add the C-side interface define and build up the
# Fortran option string piece by piece.
if (${F_COMPILER} STREQUAL "NAGFOR")
  string(APPEND CCOMMON_OPT " -DF_INTERFACE_NAG")

  # INTERFACE64 builds additionally pass -i8.
  if (INTERFACE64)
    string(APPEND FCOMMON_OPT " -i8")
  endif ()

  # Taken from Makefile.system:
  #   -dcfuns       enable non-standard double precision complex intrinsic functions
  #   -recursive    procedures are RECURSIVE by default
  #   -ieee=full    enable all IEEE arithmetic facilities, including non-stop arithmetic
  #   -w=obs        suppress warnings about obsolescent features
  #   -thread_safe  compile code for safe execution in a multi-threaded environment
  string(APPEND FCOMMON_OPT " -dcfuns -recursive -ieee=full -w=obs -thread_safe")

  # Taken from Reference-LAPACK:
  #   -quiet        suppress the compiler banner and summary
  string(APPEND FCOMMON_OPT " -quiet")
  #   -w=x77        suppress warnings about Fortran 77 features
  #   -w=ques       suppress warnings about questionable usage
  #   -w=unused     suppress warnings about unused variables
  string(APPEND FCOMMON_OPT " -w=x77 -w=ques -w=unused")

  if (USE_OPENMP)
    string(APPEND FCOMMON_OPT " -openmp")
  endif ()
endif ()
# from the root Makefile - this is for lapack-netlib to compile the correct secnd file. | |||
if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
set(TIMER "INT_ETIME") | |||
@@ -1018,7 +1018,12 @@ foreach (LA_FILE ${LA_GEN_SRC}) | |||
endforeach () | |||
if (NOT C_LAPACK) | |||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") | |||
# Per-file COMPILE_FLAGS for the LAPACK sources.
# Applying LAPACK_FFLAGS here duplicates Fortran flags, and NAG has a few flags
# that cannot be specified twice, so NAGFOR gets no per-file flags at all. The
# duplication may be unnecessary for every compiler, but to be safe it is only
# switched off for NAG. GFORTRAN additionally receives -fno-tree-vectorize.
if (${F_COMPILER} STREQUAL "GFORTRAN")
  set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize")
elseif (NOT ${F_COMPILER} STREQUAL "NAGFOR")
  set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
endif ()
@@ -58,7 +58,7 @@ set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp") | |||
# c_check | |||
set(FU "") | |||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) | |||
if (APPLE OR (MSVC AND NOT (${CMAKE_C_COMPILER_ID} MATCHES "Clang" OR ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM"))) | |||
set(FU "_") | |||
endif() | |||
if(MINGW AND NOT MINGW64) | |||
@@ -1433,7 +1433,9 @@ else(NOT CMAKE_CROSSCOMPILING) | |||
message(STATUS "MSVC") | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
else() | |||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") | |||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
endif() | |||
if (DEFINED TARGET_CORE) | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE}) | |||
endif () | |||
@@ -382,6 +382,8 @@ if (NEED_PIC) | |||
if (NOT NOFORTRAN) | |||
if (${F_COMPILER} STREQUAL "SUN") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -pic") | |||
elseif (${F_COMPILER} STREQUAL "NAGFOR") | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") | |||
else () | |||
set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") | |||
endif () | |||
@@ -640,17 +642,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
endif () | |||
if (CMAKE_Fortran_COMPILER) | |||
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") | |||
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
message(STATUS "removing fortran flags") | |||
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") | |||
# For NAGFOR, CRAY and LLVM Flang, strip the listed -m... options from the
# LAPACK Fortran flag strings (LLVM Flang additionally loses -m32/-m64).
if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
  # Every entry is its own list element, separated by ';'. -mavx2 is listed
  # before -mavx because the removal is a plain substring replacement: stripping
  # "-mavx" first would turn "-mavx2" into a stray "2".
  set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx2;-mavx;-mskylake-avx512")
  if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
    message(STATUS "removing fortran flags")
    set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
  endif ()
  foreach (FILTER_FLAG ${FILTER_FLAGS})
    # Quote the inputs so an empty flag string does not leave string(REPLACE)
    # without its required input argument.
    string(REPLACE "${FILTER_FLAG}" "" LAPACK_FFLAGS "${LAPACK_FFLAGS}")
    string(REPLACE "${FILTER_FLAG}" "" LAPACK_FPFLAGS "${LAPACK_FPFLAGS}")
  endforeach ()
endif ()
foreach (FILTER_FLAG ${FILTER_FLAGS}) | |||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) | |||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) | |||
endforeach () | |||
endif () | |||
endif () | |||
if ("${F_COMPILER}" STREQUAL "GFORTRAN") | |||
@@ -670,6 +672,9 @@ endif () | |||
if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") | |||
endif () | |||
# Intel LLVM C compiler on Windows: build the LAPACK C sources with -DNOCHANGE.
# The expansions are quoted so an empty compiler ID cannot produce a malformed if().
if ("${CMAKE_C_COMPILER_ID}" MATCHES "IntelLLVM" AND "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
  set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE")
endif ()
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
if ("${F_COMPILER}" STREQUAL "FLANG") | |||
@@ -10,6 +10,10 @@ if (${HOST_OS} STREQUAL "WINDOWS") | |||
set(HOST_OS WINNT) | |||
endif () | |||
if (${HOST_OS} STREQUAL "IOS") | |||
set(HOST_OS DARWIN) | |||
endif () | |||
if (${HOST_OS} STREQUAL "LINUX") | |||
# check if we're building natively on Android (TERMUX) | |||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) | |||
@@ -372,6 +372,12 @@ typedef int blasint; | |||
#endif | |||
#endif | |||
#if defined(ARCH_RISCV64) | |||
#ifndef YIELDING | |||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
#endif | |||
#endif | |||
#ifdef __EMSCRIPTEN__ | |||
#define YIELDING | |||
@@ -102,9 +102,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
#if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||
#if !defined(__APPLE__) && !defined(_WIN32) | |||
#define OPENBLAS_ARM_TYPE_FUNCTION .type REALNAME, %function ; | |||
#else | |||
#define OPENBLAS_ARM_TYPE_FUNCTION | |||
#endif | |||
#define PROLOGUE \ | |||
.arm ;\ | |||
.global REALNAME ;\ | |||
OPENBLAS_ARM_TYPE_FUNCTION \ | |||
REALNAME: | |||
#define EPILOGUE | |||
@@ -127,7 +127,7 @@ static char *cpuname_lower[] = { | |||
int get_feature(char *search) | |||
{ | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
@@ -163,7 +163,7 @@ int get_feature(char *search) | |||
int detect(void) | |||
{ | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||
@@ -314,7 +314,7 @@ void get_cpucount(void) | |||
{ | |||
int n=0; | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
@@ -608,7 +608,7 @@ void get_libname(void) | |||
void get_features(void) | |||
{ | |||
#ifdef __linux | |||
#if defined( __linux ) || defined( __NetBSD__ ) | |||
FILE *infile; | |||
char buffer[2048], *p,*t; | |||
p = (char *) NULL ; | |||
@@ -41,7 +41,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -231,7 +231,7 @@ | |||
CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -515,7 +515,7 @@ | |||
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -243,7 +243,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -283,7 +283,7 @@ | |||
SAME = LCE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -291,7 +291,7 @@ | |||
SAME = LCE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -419,7 +419,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -194,7 +194,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -237,7 +237,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -246,7 +246,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -264,7 +264,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -273,7 +273,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -386,7 +386,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -194,7 +194,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -237,7 +237,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -246,7 +246,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -264,7 +264,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -273,7 +273,7 @@ | |||
SAME = LCE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -386,7 +386,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -47,7 +47,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -139,7 +139,7 @@ | |||
CALL STEST1(SS,DS1(K),DS1(K),SFAC) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
40 RETURN | |||
@@ -232,7 +232,7 @@ | |||
CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
60 CONTINUE | |||
80 CONTINUE | |||
@@ -387,7 +387,7 @@ | |||
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
100 CONTINUE | |||
120 CONTINUE | |||
@@ -475,7 +475,7 @@ | |||
70 CONTINUE | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
40 CONTINUE | |||
60 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -239,7 +239,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -279,7 +279,7 @@ | |||
SAME = LDE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -287,7 +287,7 @@ | |||
SAME = LDE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -415,7 +415,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -189,7 +189,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -232,7 +232,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -241,7 +241,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -259,7 +259,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -268,7 +268,7 @@ | |||
SAME = LDE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -380,7 +380,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -47,7 +47,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -139,7 +139,7 @@ | |||
CALL STEST1(SS,DS1(K),DS1(K),SFAC) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
40 RETURN | |||
@@ -232,7 +232,7 @@ | |||
CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
60 CONTINUE | |||
80 CONTINUE | |||
@@ -387,7 +387,7 @@ | |||
CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
100 CONTINUE | |||
120 CONTINUE | |||
@@ -482,7 +482,7 @@ | |||
70 CONTINUE | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
40 CONTINUE | |||
60 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -239,7 +239,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -279,7 +279,7 @@ | |||
SAME = LSE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -287,7 +287,7 @@ | |||
SAME = LSE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -415,7 +415,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -188,7 +188,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -231,7 +231,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -240,7 +240,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -258,7 +258,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'T' | |||
CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -267,7 +267,7 @@ | |||
SAME = LSE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -379,7 +379,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -41,7 +41,7 @@ | |||
IF (PASS) THEN | |||
WRITE (NOUT,99998) | |||
ELSE | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
20 CONTINUE | |||
* | |||
@@ -231,7 +231,7 @@ | |||
CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -515,7 +515,7 @@ | |||
CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) | |||
ELSE | |||
WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
40 CONTINUE | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -243,7 +243,7 @@ | |||
$ GO TO 70 | |||
60 CONTINUE | |||
WRITE( NOUT, FMT = 9986 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
70 LTEST( I ) = LTESTT | |||
GO TO 50 | |||
* | |||
@@ -283,7 +283,7 @@ | |||
SAME = LZE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANS = 'T' | |||
CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, | |||
@@ -291,7 +291,7 @@ | |||
SAME = LZE( YY, YT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -419,7 +419,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -195,7 +195,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -238,7 +238,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -247,7 +247,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -265,7 +265,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -274,7 +274,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -387,7 +387,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,7 +10,7 @@ | |||
* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE | |||
* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) | |||
* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. | |||
* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. | |||
* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. | |||
* T LOGICAL FLAG, T TO TEST ERROR EXITS. | |||
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH | |||
* 16.0 THRESHOLD VALUE OF TEST RATIO | |||
@@ -195,7 +195,7 @@ | |||
$ GO TO 50 | |||
40 CONTINUE | |||
WRITE( NOUT, FMT = 9990 )SNAMET | |||
CALL ABORT | |||
ERROR STOP | |||
50 LTEST( I ) = LTESTT | |||
GO TO 30 | |||
* | |||
@@ -238,7 +238,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -247,7 +247,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
DO 120 J = 1, N | |||
AB( J, NMAX + 1 ) = N - J + 1 | |||
@@ -265,7 +265,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
TRANSB = 'C' | |||
CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, | |||
@@ -274,7 +274,7 @@ | |||
SAME = LZE( CC, CT, N ) | |||
IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN | |||
WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
* Test each subroutine in turn. | |||
@@ -387,7 +387,7 @@ | |||
$ CLOSE ( NTRA ) | |||
CLOSE ( NOUT ) | |||
IF( FATAL ) THEN | |||
CALL ABORT | |||
ERROR STOP | |||
END IF | |||
* | |||
10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) | |||
@@ -10,6 +10,15 @@ | |||
#define int long | |||
#endif | |||
#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) | |||
//#define LAPACK_COMPLEX_STRUCTURE | |||
#define NOCHANGE | |||
#endif | |||
/* e.g. mingw64/x86_64-w64-mingw32/include/winerror.h */ | |||
#ifdef FAILED | |||
#undef FAILED | |||
#endif | |||
#define TRUE 1 | |||
#define PASSED 1 | |||
#define TEST_ROW_MJR 1 | |||
@@ -5,14 +5,14 @@ This page documents those non-standard APIs. | |||
## BLAS-like extensions | |||
| Routine | Data Types | Description | | |||
| ------------- |:------------- | :---------------| | |||
| ?axpby | s,d,c,z | like axpy with a multiplier for y | | |||
| ?gemm3m | c,z | gemm3m | | |||
| ?imatcopy | s,d,c,z | in-place transpositon/copying | | |||
| ?omatcopy | s,d,c,z | out-of-place transpositon/copying | | |||
| ?geadd | s,d,c,z | matrix add | | |||
| ?gemmt | s,d,c,z | gemm but only a triangular part updated| | |||
| Routine | Data Types | Description | | |||
| ------------- |:------------- | :-----------------------------------------------| | |||
| ?axpby | s,d,c,z | like `axpy` with a multiplier for `y` | | |||
| ?gemm3m | c,z | `gemm3m` | | |||
| ?imatcopy | s,d,c,z | in-place transposition/copying | | |||
| ?omatcopy | s,d,c,z | out-of-place transposition/copying | | |||
| ?geadd | s,d,c,z | ATLAS-like matrix add `B = α*A+β*B` | | |||
| ?gemmt | s,d,c,z | `gemm` but only a triangular part updated | | |||
## bfloat16 functionality | |||
@@ -437,49 +437,53 @@ To then use the built OpenBLAS shared library in Visual Studio: | |||
[Qt Creator](http://qt.nokia.com/products/developer-tools/). | |||
#### Windows on Arm | |||
The following tools needs to be installed to build for Windows on Arm (WoA): | |||
- Clang for Windows on Arm. | |||
Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/). | |||
E.g: LLVM 12 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe) | |||
Run the LLVM installer and ensure that LLVM is added to environment PATH. | |||
- Download and install classic Flang for Windows on Arm. | |||
Classic Flang is the only available Fortran compiler for Windows on Arm for now. | |||
A pre-release build can be found [here](https://github.com/kaadam/flang/releases/tag/v0.1) | |||
There is no installer for classic flang and the zip package can be | |||
extracted and the path needs to be added to environment `PATH`. | |||
E.g., in PowerShell: | |||
``` | |||
$env:Path += ";C:\flang_woa\bin" | |||
``` | |||
The following steps describe how to build the static library for OpenBLAS with and without LAPACK: | |||
1. Build OpenBLAS static library with BLAS and LAPACK routines with Make: | |||
```bash | |||
$ make CC="clang-cl" HOSTCC="clang-cl" AR="llvm-ar" BUILD_WITHOUT_LAPACK=0 NOFORTRAN=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 USE_OPENMP=0 PARALLEL=1 RANLIB="llvm-ranlib" MAKE=make F_COMPILER=FLANG FC=FLANG FFLAGS_NOOPT="-march=armv8-a -cpp" FFLAGS="-march=armv8-a -cpp" NEED_PIC=0 HOSTARCH=arm64 libs netlib | |||
``` | |||
### Windows on Arm | |||
A fully functional native OpenBLAS for WoA can be built as both a static and a dynamic library using the LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool). | |||
(Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using | |||
the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) | |||
1. Clone OpenBLAS to your local machine and check out the latest release of OpenBLAS (unless you want to build the latest development snapshot). Here we use the 0.3.28 release as the example; this exact version may be outdated by the time you read this. | |||
```cmd | |||
git clone https://github.com/OpenMathLib/OpenBLAS.git | |||
cd OpenBLAS | |||
git checkout v0.3.28 | |||
``` | |||
2. Install Latest LLVM toolchain for WoA: | |||
Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages | |||
usually lag a week or two behind their corresponding source release). | |||
Make sure to enable the option “Add LLVM to the system PATH for all the users” | |||
Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path | |||
3. Launch the Native Command Prompt for Windows ARM64: | |||
From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” | |||
Alternatively open command prompt, run the following command to activate the environment: | |||
"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" | |||
Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: | |||
```cmd | |||
cd OpenBLAS | |||
mkdir build | |||
cd build | |||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new | |||
2. Build static library with BLAS routines using CMake: | |||
ninja -j16 | |||
``` | |||
Note: You might want to include additional options in the cmake command here. For example, the default configuration only generates a static `.lib` version of the library. If you prefer a DLL, you can add `-DBUILD_SHARED_LIBS=ON`. | |||
Classic Flang has compatibility issues with CMake, hence only BLAS routines can be compiled with CMake: | |||
Note that it is also possible to use the same setup to build OpenBLAS with Make, if you prefer Makefiles over the CMake build for some reason: | |||
```bash | |||
$ mkdir build | |||
$ cd build | |||
$ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang -DBUILD_WITHOUT_LAPACK=1 -DNOFORTRAN=1 -DDYNAMIC_ARCH=0 -DTARGET=ARMV8 -DARCH=arm64 -DBINARY=64 -DUSE_OPENMP=0 -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_CROSSCOMPILING=1 -DCMAKE_SYSTEM_NAME=Windows | |||
$ cmake --build . --config Release | |||
```cmd | |||
$ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make | |||
``` | |||
!!! tip "`getarch.exe` execution error" | |||
If you notice that platform-specific headers by `getarch.exe` are not | |||
generated correctly, this could be due to a known debug runtime DLL issue for | |||
arm64 platforms. Please check out [this page](https://linaro.atlassian.net/wiki/spaces/WOAR/pages/28677636097/Debug+run-time+DLL+issue#Workaround) | |||
for a workaround. | |||
#### Generating an import library | |||
@@ -532,7 +536,6 @@ In your shell, move to this directory: `cd exports`. | |||
To build OpenBLAS for Android, you will need the following tools installed on your machine: | |||
- [The Android NDK](https://developer.android.com/ndk/) | |||
- Perl | |||
- Clang compiler on the build machine | |||
The next two sections below describe how to build with Clang for ARMV7 and | |||
@@ -574,7 +577,9 @@ utility in the make command above, like so: | |||
AR=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-gcc-ar | |||
``` | |||
otherwise you may get a linker error complaining like `malformed archive header | |||
name at 8` when the native macOS `ar` command was invoked instead. | |||
name at 8` when the native macOS `ar` command was invoked instead. Note that | |||
with recent NDK versions, the AR tool may be named `llvm-ar` rather than what | |||
is assumed above. | |||
#### Building for ARMV8 | |||
@@ -604,12 +609,17 @@ Note: for NDK 23b, something as simple as: | |||
export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH | |||
make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 | |||
``` | |||
appears to be sufficient on Linux. | |||
appears to be sufficient on Linux. On OSX, setting AR to the ar provided in the | |||
"bin" path of the NDK (probably `llvm-ar`) is also necessary. | |||
??? note "Alternative build script for 3 architectures" | |||
This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, `X86`) and install them to `/opt/OpenBLAS/lib`. | |||
This script will build OpenBLAS for 3 architectures (`ARMV7`, `ARMV8`, | |||
`X86`) and install them to `/opt/OpenBLAS/lib`. Of course you can also copy | |||
only the section that is of interest to you - also notice that the `AR=` | |||
line may need adapting to the name of the ar tool provided in your | |||
`$TOOLCHAIN/bin` - for example `llvm-ar` in some recent NDK versions. | |||
It was tested on macOS with NDK version 21.3.6528147. | |||
```bash | |||
@@ -126,6 +126,18 @@ void openblas_set_num_threads(int num_threads) { | |||
goto_set_num_threads(num_threads); | |||
} | |||
#ifdef OS_LINUX | |||
/* Stub implementation of openblas_setaffinity: none of the arguments
 * (thread_idx, cpusetsize, cpu_set) are read. It only prints a hint to
 * stderr pointing the caller at the OpenMP environment variables and
 * always returns -1. */
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { | |||
fprintf(stderr,"OpenBLAS: use OpenMP environment variables for setting cpu affinity\n"); | |||
return -1; | |||
} | |||
/* Stub implementation of openblas_getaffinity: none of the arguments
 * (thread_idx, cpusetsize, cpu_set) are read and cpu_set is not written.
 * It only prints a hint to stderr pointing the caller at the OpenMP
 * environment variables and always returns -1. */
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { | |||
fprintf(stderr,"OpenBLAS: use OpenMP environment variables for querying cpu affinity\n"); | |||
return -1; | |||
} | |||
#endif | |||
int blas_thread_init(void){ | |||
#if defined(__FreeBSD__) && defined(__clang__) | |||
@@ -2538,7 +2538,7 @@ static void *alloc_shm(void *address){ | |||
} | |||
#endif | |||
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS | |||
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) | |||
static void alloc_hugetlb_free(struct release_t *release){ | |||
@@ -3254,7 +3254,7 @@ void blas_shutdown(void){ | |||
#endif | |||
newmemory[pos].lock = 0; | |||
} | |||
free(newmemory); | |||
free((void*)newmemory); | |||
newmemory = NULL; | |||
memory_overflowed = 0; | |||
} | |||
@@ -869,8 +869,12 @@ lapackobjs2z="$lapackobjs2z | |||
#functions added post 3.11 | |||
lapackobjs2c="$lapackobjs2c | |||
cgelst | |||
cgeqp3rk | |||
claqp2rk | |||
claqp3rk | |||
clatrs3 | |||
crscl | |||
ctrsyl3 | |||
" | |||
# claqz0 | |||
@@ -894,6 +898,16 @@ lapackobjs2d="$lapackobjs2d | |||
# dlaqz3 | |||
# dlaqz4 | |||
lapackobjs2s="$lapackobjs2s | |||
sgelst | |||
sgeqp3rk | |||
slaqp2rk | |||
slaqp3rk | |||
slarmm | |||
slatrs3 | |||
strsyl3 | |||
" | |||
lapackobjs2z="$lapackobjs2z | |||
zgelst | |||
zgeqp3rk | |||
@@ -2,5 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
SGEMVNKERNEL = gemv_n_sve.c | |||
DGEMVNKERNEL = gemv_n_sve.c | |||
SGEMVTKERNEL = gemv_t_sve.c | |||
DGEMVTKERNEL = gemv_t_sve.c | |||
SGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v4x3.c |
@@ -64,8 +64,8 @@ DAXPYKERNEL = daxpy_thunderx2t99.S | |||
CAXPYKERNEL = zaxpy.S | |||
ZAXPYKERNEL = zaxpy.S | |||
SROTKERNEL = rot.S | |||
DROTKERNEL = rot.S | |||
SROTKERNEL = rot.c | |||
DROTKERNEL = rot.c | |||
CROTKERNEL = zrot.S | |||
ZROTKERNEL = zrot.S | |||
@@ -94,8 +94,8 @@ DCOPYKERNEL = copy_thunderx2t99.c | |||
CCOPYKERNEL = copy_thunderx2t99.c | |||
ZCOPYKERNEL = copy_thunderx2t99.c | |||
SSWAPKERNEL = swap_thunderx2t99.S | |||
DSWAPKERNEL = swap_thunderx2t99.S | |||
SSWAPKERNEL = swap.c | |||
DSWAPKERNEL = swap.c | |||
CSWAPKERNEL = swap_thunderx2t99.S | |||
ZSWAPKERNEL = swap_thunderx2t99.S | |||
@@ -104,10 +104,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
ICAMAXKERNEL = izamax_thunderx2t99.c | |||
IZAMAXKERNEL = izamax_thunderx2t99.c | |||
SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
@@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
ICAMAXKERNEL = izamax_thunderx2t99.c | |||
IZAMAXKERNEL = izamax_thunderx2t99.c | |||
SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
SNRM2KERNEL = nrm2.S | |||
DNRM2KERNEL = nrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
DDOTKERNEL = dot.c | |||
SDOTKERNEL = dot.c | |||
@@ -1,4 +1,4 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
SGEMVTKERNEL = gemv_t_sve.c | |||
DGEMVTKERNEL = gemv_t_sve.c | |||
SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
DGEMVTKERNEL = gemv_t_sve_v1x3.c |
@@ -0,0 +1,152 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#ifdef DOUBLE | |||
#define SV_COUNT svcntd | |||
#define SV_TYPE svfloat64_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64_s64 | |||
#define SV_DUP svdup_f64 | |||
#else | |||
#define SV_COUNT svcntw | |||
#define SV_TYPE svfloat32_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32_s64 | |||
#define SV_DUP svdup_f32 | |||
#endif | |||
/*
 * Transposed matrix-vector update: for every column j in [0, n) of `a`
 * (columns are `lda` elements apart) accumulate
 *     y[j * inc_y] += alpha * sum_{i < m} a[j * lda + i] * x[i * inc_x]
 *
 * m, n      - length of each column / number of columns processed
 * dummy1    - not referenced in this function
 * alpha     - scalar applied to each column dot product
 * a, lda    - matrix data and distance between consecutive columns
 * x, inc_x  - input vector and its stride
 * y, inc_y  - vector updated in place and its stride
 * buffer    - not referenced in this function
 *
 * Always returns 0.
 *
 * When inc_x == 1 an SVE path is used that works on three columns at a
 * time; any other stride falls through to the scalar loop at the bottom.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
FLOAT *buffer) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
BLASLONG j; | |||
FLOAT *a_ptr; | |||
FLOAT temp; | |||
iy = 0; | |||
if (inc_x == 1) { | |||
/* Split the n columns into three groups of `width` = ceil(n / 3)
 * columns each; group k starts at column k * width. The a?_ptr and
 * y?_ptr pointers address the first column / output of each group. */
BLASLONG width = (n + 3 - 1) / 3; | |||
FLOAT *a0_ptr = a + lda * width * 0; | |||
FLOAT *a1_ptr = a + lda * width * 1; | |||
FLOAT *a2_ptr = a + lda * width * 2; | |||
FLOAT *y0_ptr = y + inc_y * width * 0; | |||
FLOAT *y1_ptr = y + inc_y * width * 1; | |||
FLOAT *y2_ptr = y + inc_y * width * 2; | |||
/* Iteration j handles column j of every group at once. */
for (j = 0; j < width; j++) { | |||
/* Per-group predicates: all lanes active when the group's column
 * index is below n, all lanes inactive otherwise (the last groups
 * can run out of columns when n is not a multiple of 3). */
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
/* One vector accumulator per column, started at zero. */
SV_TYPE temp00_vec = SV_DUP(0.0); | |||
SV_TYPE temp01_vec = SV_DUP(0.0); | |||
SV_TYPE temp02_vec = SV_DUP(0.0); | |||
i = 0; | |||
BLASLONG sve_size = SV_COUNT(); | |||
/* Main loop: consume rows in whole vectors of sve_size elements.
 * The x chunk is loaded once and reused for all three columns. */
while ((i + sve_size * 1 - 1) < m) { | |||
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
/* Merging multiply-accumulate: lanes that are inactive in the
 * predicate keep their previous accumulator value. */
temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
i += sve_size * 1; | |||
} | |||
/* Tail: fewer than sve_size rows remain. pg0 enables only the lanes
 * for rows i..m-1 and is AND-ed into each column predicate. */
if (i < m) { | |||
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); | |||
SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); | |||
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); | |||
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); | |||
} | |||
/* Reduce each accumulator across all lanes and update y, but only
 * for groups whose column index is in range. The pg0? predicates may
 * have been narrowed by the tail above, so the column test is
 * repeated rather than reusing them. */
if ((j + width * 0) < n) { | |||
temp = svaddv(SV_TRUE(), temp00_vec); | |||
y0_ptr[iy] += alpha * temp; | |||
} | |||
if ((j + width * 1) < n) { | |||
temp = svaddv(SV_TRUE(), temp01_vec); | |||
y1_ptr[iy] += alpha * temp; | |||
} | |||
if ((j + width * 2) < n) { | |||
temp = svaddv(SV_TRUE(), temp02_vec); | |||
y2_ptr[iy] += alpha * temp; | |||
} | |||
/* Advance to the next column within every group. */
iy += inc_y; | |||
a0_ptr += lda; | |||
a1_ptr += lda; | |||
a2_ptr += lda; | |||
} | |||
return(0); | |||
} | |||
/* Scalar fallback for inc_x != 1: one strided dot product per column. */
a_ptr = a; | |||
for (j = 0; j < n; j++) { | |||
temp = 0.0; | |||
ix = 0; | |||
for (i = 0; i < m; i++) { | |||
temp += a_ptr[i] * x[ix]; | |||
ix += inc_x; | |||
} | |||
y[iy] += alpha * temp; | |||
iy += inc_y; | |||
a_ptr += lda; | |||
} | |||
return(0); | |||
} | |||
@@ -0,0 +1,234 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2024, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <arm_sve.h> | |||
#include "common.h" | |||
#ifdef DOUBLE | |||
#define SV_COUNT svcntd | |||
#define SV_TYPE svfloat64_t | |||
#define SV_TRUE svptrue_b64 | |||
#define SV_WHILE svwhilelt_b64_s64 | |||
#define SV_DUP svdup_f64 | |||
#else | |||
#define SV_COUNT svcntw | |||
#define SV_TYPE svfloat32_t | |||
#define SV_TRUE svptrue_b32 | |||
#define SV_WHILE svwhilelt_b32_s64 | |||
#define SV_DUP svdup_f32 | |||
#endif | |||
/*
 * Helper for the SVE path of the kernel below: for the column starting at
 * `col_ptr`, load four consecutive vectors at row offset `row` (under
 * predicates p0..p3) and multiply-accumulate them with the already loaded
 * x vectors xv0..xv3 into the partial sums s0..s3.  Relies on `row` and
 * `xv0`..`xv3` being in scope at the point of use.
 */
#define GEMV_T_COLUMN_STEP(col_ptr, p0, p1, p2, p3, s0, s1, s2, s3)      \
  do {                                                                    \
    s0 = svmla_m(p0, s0, svld1_vnum(p0, (col_ptr) + row, 0), xv0);        \
    s1 = svmla_m(p1, s1, svld1_vnum(p1, (col_ptr) + row, 1), xv1);        \
    s2 = svmla_m(p2, s2, svld1_vnum(p2, (col_ptr) + row, 2), xv2);        \
    s3 = svmla_m(p3, s3, svld1_vnum(p3, (col_ptr) + row, 3), xv3);        \
  } while (0)

/*
 * Transposed matrix-vector update: for every column j of `a` (columns are
 * `lda` elements apart, `m` rows each) accumulate
 *
 *     y[j * inc_y] += alpha * dot(a[:, j], x)
 *
 * When x is contiguous (inc_x == 1) the columns are split into three
 * groups of ceil(n / 3) columns; each outer iteration handles one column
 * from every group, consuming four SVE vectors of rows per inner step.
 * Any other inc_x takes the plain scalar loop.  `dummy1` and `buffer` are
 * not used.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
  BLASLONG row, col;
  BLASLONG y_off = 0;
  FLOAT temp;

  /* Strided x: straightforward scalar dot product per column. */
  if (inc_x != 1) {
    FLOAT *col_ptr = a;

    for (col = 0; col < n; col++) {
      BLASLONG x_off = 0;

      temp = 0.0;
      for (row = 0; row < m; row++) {
        temp += col_ptr[row] * x[x_off];
        x_off += inc_x;
      }
      y[y_off] += alpha * temp;
      y_off += inc_y;
      col_ptr += lda;
    }
    return(0);
  }

  /* Contiguous x: vectorised path, three columns at a time. */
  const BLASLONG vl = SV_COUNT();
  const BLASLONG rows_per_step = vl * 4;
  const BLASLONG third = (n + 3 - 1) / 3;

  FLOAT *colA = a + lda * third * 0;
  FLOAT *colB = a + lda * third * 1;
  FLOAT *colC = a + lda * third * 2;

  FLOAT *yA = y + inc_y * third * 0;
  FLOAT *yB = y + inc_y * third * 1;
  FLOAT *yC = y + inc_y * third * 2;

  for (col = 0; col < third; col++) {
    /* The last group(s) may run out of columns before `third` is reached. */
    const int liveA = (col + third * 0) < n;
    const int liveB = (col + third * 1) < n;
    const int liveC = (col + third * 2) < n;

    /* One predicate per (vector slot, column); all-false disables a column. */
    svbool_t pA0 = liveA ? SV_TRUE() : svpfalse();
    svbool_t pA1 = pA0, pA2 = pA0, pA3 = pA0;
    svbool_t pB0 = liveB ? SV_TRUE() : svpfalse();
    svbool_t pB1 = pB0, pB2 = pB0, pB3 = pB0;
    svbool_t pC0 = liveC ? SV_TRUE() : svpfalse();
    svbool_t pC1 = pC0, pC2 = pC0, pC3 = pC0;

    /* Four independent partial sums per column. */
    SV_TYPE sA0 = SV_DUP(0.0), sA1 = SV_DUP(0.0);
    SV_TYPE sA2 = SV_DUP(0.0), sA3 = SV_DUP(0.0);
    SV_TYPE sB0 = SV_DUP(0.0), sB1 = SV_DUP(0.0);
    SV_TYPE sB2 = SV_DUP(0.0), sB3 = SV_DUP(0.0);
    SV_TYPE sC0 = SV_DUP(0.0), sC1 = SV_DUP(0.0);
    SV_TYPE sC2 = SV_DUP(0.0), sC3 = SV_DUP(0.0);

    /* Full steps: four complete vectors of rows are available. */
    for (row = 0; (row + rows_per_step - 1) < m; row += rows_per_step) {
      SV_TYPE xv0 = svld1_vnum(SV_TRUE(), x + row, 0);
      SV_TYPE xv1 = svld1_vnum(SV_TRUE(), x + row, 1);
      SV_TYPE xv2 = svld1_vnum(SV_TRUE(), x + row, 2);
      SV_TYPE xv3 = svld1_vnum(SV_TRUE(), x + row, 3);

      GEMV_T_COLUMN_STEP(colA, pA0, pA1, pA2, pA3, sA0, sA1, sA2, sA3);
      GEMV_T_COLUMN_STEP(colB, pB0, pB1, pB2, pB3, sB0, sB1, sB2, sB3);
      GEMV_T_COLUMN_STEP(colC, pC0, pC1, pC2, pC3, sC0, sC1, sC2, sC3);
    }

    /* Remainder: mask every vector slot against the row limit m. */
    if (row < m) {
      svbool_t r0 = SV_WHILE(row + vl * 0, m);
      svbool_t r1 = SV_WHILE(row + vl * 1, m);
      svbool_t r2 = SV_WHILE(row + vl * 2, m);
      svbool_t r3 = SV_WHILE(row + vl * 3, m);

      pA0 = svand_z(SV_TRUE(), r0, pA0);
      pA1 = svand_z(SV_TRUE(), r1, pA1);
      pA2 = svand_z(SV_TRUE(), r2, pA2);
      pA3 = svand_z(SV_TRUE(), r3, pA3);
      pB0 = svand_z(SV_TRUE(), r0, pB0);
      pB1 = svand_z(SV_TRUE(), r1, pB1);
      pB2 = svand_z(SV_TRUE(), r2, pB2);
      pB3 = svand_z(SV_TRUE(), r3, pB3);
      pC0 = svand_z(SV_TRUE(), r0, pC0);
      pC1 = svand_z(SV_TRUE(), r1, pC1);
      pC2 = svand_z(SV_TRUE(), r2, pC2);
      pC3 = svand_z(SV_TRUE(), r3, pC3);

      SV_TYPE xv0 = svld1_vnum(r0, x + row, 0);
      SV_TYPE xv1 = svld1_vnum(r1, x + row, 1);
      SV_TYPE xv2 = svld1_vnum(r2, x + row, 2);
      SV_TYPE xv3 = svld1_vnum(r3, x + row, 3);

      GEMV_T_COLUMN_STEP(colA, pA0, pA1, pA2, pA3, sA0, sA1, sA2, sA3);
      GEMV_T_COLUMN_STEP(colB, pB0, pB1, pB2, pB3, sB0, sB1, sB2, sB3);
      GEMV_T_COLUMN_STEP(colC, pC0, pC1, pC2, pC3, sC0, sC1, sC2, sC3);
    }

    /* Pairwise tree: (s0 + s1) + (s2 + s3) for each column. */
    sA0 = svadd_x(SV_TRUE(), sA0, sA1);
    sB0 = svadd_x(SV_TRUE(), sB0, sB1);
    sC0 = svadd_x(SV_TRUE(), sC0, sC1);
    sA2 = svadd_x(SV_TRUE(), sA2, sA3);
    sB2 = svadd_x(SV_TRUE(), sB2, sB3);
    sC2 = svadd_x(SV_TRUE(), sC2, sC3);
    sA0 = svadd_x(SV_TRUE(), sA0, sA2);
    sB0 = svadd_x(SV_TRUE(), sB0, sB2);
    sC0 = svadd_x(SV_TRUE(), sC0, sC2);

    /* Horizontal sum and update of y, only for columns that exist. */
    if (liveA) {
      temp = svaddv(SV_TRUE(), sA0);
      yA[y_off] += alpha * temp;
    }
    if (liveB) {
      temp = svaddv(SV_TRUE(), sB0);
      yB[y_off] += alpha * temp;
    }
    if (liveC) {
      temp = svaddv(SV_TRUE(), sC0);
      yC[y_off] += alpha * temp;
    }

    y_off += inc_y;
    colA += lda;
    colB += lda;
    colC += lda;
  }

  return(0);
}

#undef GEMV_T_COLUMN_STEP
@@ -0,0 +1,40 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
#include "rot_kernel_sve.c" | |||
#include "rot_kernel_c.c" | |||
/*
 * Plane rotation entry point: applies
 *     x' = c * x + s * y,   y' = c * y - s * x
 * to n element pairs.  Unit-stride inputs are handed to the SVE kernel,
 * every other stride combination to the scalar kernel.  A non-positive n
 * is a no-op.  Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    if (n > 0)
    {
        const int contiguous = (inc_x == 1) && (inc_y == 1);

        if (contiguous)
        {
            rot_kernel_sve(n, x, y, c, s);
        }
        else
        {
            rot_kernel_c(n, x, inc_x, y, inc_y, c, s);
        }
    }

    return (0);
}
@@ -0,0 +1,44 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
/*
 * Scalar plane rotation for arbitrary strides.  For each of the n element
 * pairs (x[k * inc_x], y[k * inc_y]):
 *     x' = c * x + s * y
 *     y' = c * y - s * x
 * y is written before x, using the not-yet-updated x value.
 * Always returns 0.
 */
static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k;
    BLASLONG x_pos = 0;
    BLASLONG y_pos = 0;

    for (k = 0; k < n; k++)
    {
        /* Hold the new x until y has been computed from the old x. */
        FLOAT rotated_x = c * x[x_pos] + s * y[y_pos];

        y[y_pos] = c * y[y_pos] - s * x[x_pos];
        x[x_pos] = rotated_x;

        x_pos += inc_x;
        y_pos += inc_y;
    }

    return (0);
}
@@ -0,0 +1,59 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
/*
 * Precision-dependent SVE aliases for the rotation kernel: DOUBLE picks the
 * 64-bit element forms, otherwise the 32-bit forms are used.
 */
#if defined(DOUBLE)
#define SVE_TYPE     svfloat64_t      /* vector of FLOAT              */
#define SVE_ZERO     svdup_f64(0.0)   /* all-zero vector              */
#define SVE_WHILELT  svwhilelt_b64    /* "index < limit" predicate    */
#define SVE_ALL      svptrue_b64()    /* all-lanes-active predicate   */
#define SVE_WIDTH    svcntd()         /* lanes per vector             */
#else
#define SVE_TYPE     svfloat32_t      /* vector of FLOAT              */
#define SVE_ZERO     svdup_f32(0.0)   /* all-zero vector              */
#define SVE_WHILELT  svwhilelt_b32    /* "index < limit" predicate    */
#define SVE_ALL      svptrue_b32()    /* all-lanes-active predicate   */
#define SVE_WIDTH    svcntw()         /* lanes per vector             */
#endif
/*
 * SVE plane rotation for contiguous x and y of length n:
 *     x' = c * x + s * y
 *     y' = c * y - s * x
 * Each pass handles one vector of elements; the predicate produced by
 * SVE_WHILELT masks off lanes at or beyond n, so no scalar tail is needed.
 * Always returns 0.
 */
static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    const BLASLONG lanes = SVE_WIDTH;
    BLASLONG pos = 0;

    while (pos < n)
    {
        svbool_t active = SVE_WHILELT((uint64_t)pos, (uint64_t)n);

        SVE_TYPE xv = svld1(active, &x[pos]);
        SVE_TYPE yv = svld1(active, &y[pos]);

        /* Four separate products; sums are formed afterwards (no fusing). */
        SVE_TYPE c_times_x = svmul_z(active, xv, c);
        SVE_TYPE s_times_y = svmul_z(active, yv, s);
        SVE_TYPE s_times_x = svmul_z(active, xv, s);
        SVE_TYPE c_times_y = svmul_z(active, yv, c);

        svst1(active, &x[pos], svadd_z(active, c_times_x, s_times_y));
        svst1(active, &y[pos], svsub_z(active, c_times_y, s_times_x));

        pos += lanes;
    }

    return (0);
}
@@ -0,0 +1,40 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include "swap_kernel_sve.c" | |||
#include "swap_kernel_c.c" | |||
/*
 * Vector swap entry point: exchanges n elements between x and y.
 * Unit-stride inputs go to the SVE kernel, anything else to the scalar
 * kernel.  The dummy* parameters are not used.  A non-positive n is a
 * no-op.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    if (n > 0)
    {
        const int contiguous = (inc_x == 1) && (inc_y == 1);

        if (contiguous)
        {
            swap_kernel_sve(n, x, y);
        }
        else
        {
            swap_kernel_c(n, x, inc_x, y, inc_y);
        }
    }

    return (0);
}
@@ -0,0 +1,46 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <stdio.h> | |||
/*
 * Scalar swap for arbitrary strides: exchanges x[k * inc_x] with
 * y[k * inc_y] for k = 0 .. n-1.  Always returns 0.
 */
static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG x_pos = 0;
    BLASLONG y_pos = 0;

    for (k = 0; k < n; k++)
    {
        FLOAT saved_x = x[x_pos];

        x[x_pos] = y[y_pos];
        y[y_pos] = saved_x;

        x_pos += inc_x;
        y_pos += inc_y;
    }

    return (0);
}
@@ -0,0 +1,62 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#include "common.h" | |||
#include <arm_sve.h> | |||
/*
 * Precision-dependent SVE aliases for the swap kernel: DOUBLE picks the
 * 64-bit element forms, otherwise the 32-bit forms are used.
 */
#if defined(DOUBLE)
#define SVE_TYPE     svfloat64_t      /* vector of FLOAT              */
#define SVE_ZERO     svdup_f64(0.0)   /* all-zero vector              */
#define SVE_WHILELT  svwhilelt_b64    /* "index < limit" predicate    */
#define SVE_ALL      svptrue_b64()    /* all-lanes-active predicate   */
#define SVE_WIDTH    svcntd()         /* lanes per vector             */
#else
#define SVE_TYPE     svfloat32_t      /* vector of FLOAT              */
#define SVE_ZERO     svdup_f32(0.0)   /* all-zero vector              */
#define SVE_WHILELT  svwhilelt_b32    /* "index < limit" predicate    */
#define SVE_ALL      svptrue_b32()    /* all-lanes-active predicate   */
#define SVE_WIDTH    svcntw()         /* lanes per vector             */
#endif
/*
 * SVE swap for contiguous x and y of length n.  Each pass exchanges two
 * vectors' worth of elements; the predicates from SVE_WHILELT mask off
 * lanes at or beyond n, so the final partial pass needs no scalar tail.
 * All four loads are issued before any store.  Always returns 0.
 */
static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y)
{
    const BLASLONG lanes = SVE_WIDTH;
    const BLASLONG step = lanes * 2;
    BLASLONG first;

    for (first = 0; first < n; first += step)
    {
        const BLASLONG second = first + lanes;

        svbool_t mask_first = SVE_WHILELT((uint64_t)first, (uint64_t)n);
        svbool_t mask_second = SVE_WHILELT((uint64_t)(second), (uint64_t)n);

        SVE_TYPE x_first = svld1(mask_first, &x[first]);
        SVE_TYPE y_first = svld1(mask_first, &y[first]);
        SVE_TYPE x_second = svld1(mask_second, &x[second]);
        SVE_TYPE y_second = svld1(mask_second, &y[second]);

        svst1(mask_first, &x[first], y_first);
        svst1(mask_first, &y[first], x_first);
        svst1(mask_second, &x[second], y_second);
        svst1(mask_second, &y[second], x_second);
    }

    return (0);
}
@@ -35,11 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||
#define INT_V_T vint32m2_t | |||
#define VID_V_INT __riscv_vid_v_i32m2 | |||
#define VID_V_INT __riscv_vid_v_u32m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
#define VBOOL_T vbool16_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
@@ -48,11 +49,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||
#define INT_V_T vint64m2_t | |||
#define VID_V_INT __riscv_vid_v_i64m2 | |||
#define VID_V_INT __riscv_vid_v_u64m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
#define VBOOL_T vbool32_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 | |||
#endif | |||
// Optimizes the implementation in ../generic/symm_lcopy_4.c | |||
@@ -70,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
INT_V_T vindex_max, vindex; | |||
size_t vl = VSETVL_MAX; | |||
vindex_max = VID_V_INT(vl); | |||
vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); | |||
for (js = n; js > 0; js -= vl, posX += vl) { | |||
vl = VSETVL(js); | |||
@@ -98,4 +100,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
return 0; | |||
} | |||
@@ -35,11 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||
#define INT_V_T vint32m2_t | |||
#define VID_V_INT __riscv_vid_v_i32m2 | |||
#define VID_V_INT __riscv_vid_v_u32m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
#define VBOOL_T vbool16_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
@@ -48,11 +49,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||
#define INT_V_T vint64m2_t | |||
#define VID_V_INT __riscv_vid_v_i64m2 | |||
#define VID_V_INT __riscv_vid_v_u64m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
#define VBOOL_T vbool32_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 | |||
#endif | |||
// Optimizes the implementation in ../generic/symm_ucopy_4.c | |||
@@ -70,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
INT_V_T vindex_max, vindex; | |||
size_t vl = VSETVL_MAX; | |||
vindex_max = VID_V_INT(vl); | |||
vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); | |||
for (js = n; js > 0; js -= vl, posX += vl) { | |||
vl = VSETVL(js); | |||
@@ -97,4 +99,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
} | |||
return 0; | |||
} | |||
} |
@@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 | |||
#define INT_V_T vint32m2_t | |||
#define VID_V_INT __riscv_vid_v_i32m2 | |||
#define VID_V_INT __riscv_vid_v_u32m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
@@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VBOOL_T vbool16_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
@@ -64,7 +65,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 | |||
#define INT_V_T vint64m2_t | |||
#define VID_V_INT __riscv_vid_v_i64m2 | |||
#define VID_V_INT __riscv_vid_v_u64m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
@@ -73,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VBOOL_T vbool32_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 | |||
#endif | |||
@@ -92,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
INT_V_T vindex_max, vindex; | |||
size_t vl = VSETVL_MAX; | |||
vindex_max = VID_V_INT(vl); | |||
vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); | |||
vzero = VFMVVF_FLOAT(ZERO, vl); | |||
for (js = n; js > 0; js -= vl, posX += vl) { | |||
@@ -136,4 +138,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
return 0; | |||
} | |||
@@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 | |||
#define INT_V_T vint32m2_t | |||
#define VID_V_INT __riscv_vid_v_i32m2 | |||
#define VID_V_INT __riscv_vid_v_u32m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
@@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VBOOL_T vbool16_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
@@ -64,7 +65,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 | |||
#define INT_V_T vint64m2_t | |||
#define VID_V_INT __riscv_vid_v_i64m2 | |||
#define VID_V_INT __riscv_vid_v_u64m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
@@ -73,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VBOOL_T vbool32_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 | |||
#endif | |||
@@ -90,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
INT_V_T vindex_max, vindex; | |||
size_t vl = VSETVL_MAX; | |||
vindex_max = VID_V_INT(vl); | |||
vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); | |||
vzero = VFMVVF_FLOAT(ZERO, vl); | |||
for (js = n; js > 0; js -= vl, posX += vl) { | |||
@@ -132,4 +134,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
} | |||
return 0; | |||
} | |||
} |
@@ -41,11 +41,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 | |||
#define INT_V_T vint32m2_t | |||
#define VID_V_INT __riscv_vid_v_i32m2 | |||
#define VID_V_INT __riscv_vid_v_u32m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
#define VBOOL_T vbool16_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
@@ -60,11 +61,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 | |||
#define INT_V_T vint64m2_t | |||
#define VID_V_INT __riscv_vid_v_i64m2 | |||
#define VID_V_INT __riscv_vid_v_u64m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
#define VBOOL_T vbool32_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) | |||
@@ -81,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
INT_V_T vindex_max, vindex; | |||
size_t vl = VSETVL_MAX; | |||
vindex_max = VID_V_INT(vl); | |||
vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); | |||
for (js = n; js > 0; js -= vl, posX += vl) { | |||
vl = VSETVL(js); | |||
@@ -118,4 +120,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
return 0; | |||
} | |||
@@ -41,11 +41,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 | |||
#define INT_V_T vint32m2_t | |||
#define VID_V_INT __riscv_vid_v_i32m2 | |||
#define VID_V_INT __riscv_vid_v_u32m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
#define VBOOL_T vbool16_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
@@ -60,11 +61,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 | |||
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 | |||
#define INT_V_T vint64m2_t | |||
#define VID_V_INT __riscv_vid_v_i64m2 | |||
#define VID_V_INT __riscv_vid_v_u64m2 | |||
#define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
#define VBOOL_T vbool32_t | |||
#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 | |||
#endif | |||
@@ -83,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
size_t vl = VSETVL_MAX; | |||
vindex_max = VID_V_INT(vl); | |||
vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); | |||
for (js = n; js > 0; js -= vl, posX += vl) { | |||
vl = VSETVL(js); | |||
@@ -118,4 +120,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
} | |||
return 0; | |||
} | |||
} |
@@ -42,10 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 | |||
#define VBOOL_T vbool16_t | |||
#define UINT_V_T vint32m2_t | |||
#define VID_V_UINT __riscv_vid_v_i32m2 | |||
#define VID_V_UINT __riscv_vid_v_u32m2 | |||
#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 | |||
#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 | |||
#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 | |||
#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
#define FLOAT_V_T vfloat64m2_t | |||
@@ -63,6 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 | |||
#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 | |||
#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 | |||
#define V_UM2_TO_IM2(values) values | |||
#endif | |||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
@@ -99,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
} | |||
i = 0; | |||
do | |||
do | |||
{ | |||
if (X > posY) | |||
{ | |||
@@ -119,9 +121,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
X ++; | |||
i ++; | |||
} | |||
else | |||
else | |||
{ | |||
vindex = VID_V_UINT(vl); | |||
vindex = V_UM2_TO_IM2(VID_V_UINT(vl)); | |||
for (unsigned int j = 0; j < vl; j++) | |||
{ | |||
vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); | |||
@@ -152,4 +154,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
} | |||
return 0; | |||
} | |||
} |
@@ -67,8 +67,14 @@ extern "C" { | |||
#define lapack_logical lapack_int | |||
#endif | |||
#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) | |||
#define LAPACK_COMPLEX_STRUCTURE | |||
#define LAPACK_GLOBAL(lcname,UCNAME) lcname | |||
#define NOCHANGE | |||
#endif | |||
#ifndef LAPACK_COMPLEX_CUSTOM | |||
#if defined(_MSC_VER) | |||
#if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER) | |||
#define _CRT_USE_C_COMPLEX_H | |||
#include <complex.h> | |||
#define LAPACK_COMPLEX_CUSTOM | |||
@@ -378,7 +378,7 @@ | |||
EXTERNAL CLARF, CLARFG, CSWAP | |||
* .. | |||
* .. Intrinsic Functions .. | |||
INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT | |||
INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT | |||
* .. | |||
* .. External Functions .. | |||
LOGICAL SISNAN | |||
@@ -599,8 +599,8 @@ | |||
* | |||
IF( SISNAN( REAL( TAU(KK) ) ) ) THEN | |||
TAUNAN = REAL( TAU(KK) ) | |||
ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN | |||
TAUNAN = IMAG( TAU(KK) ) | |||
ELSE IF( SISNAN( AIMAG( TAU(KK) ) ) ) THEN | |||
TAUNAN = AIMAG( TAU(KK) ) | |||
ELSE | |||
TAUNAN = ZERO | |||
END IF | |||
@@ -431,7 +431,7 @@ | |||
EXTERNAL CGEMM, CGEMV, CLARFG, CSWAP | |||
* .. | |||
* .. Intrinsic Functions .. | |||
INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT | |||
INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT | |||
* .. | |||
* .. External Functions .. | |||
LOGICAL SISNAN | |||
@@ -739,8 +739,8 @@ | |||
* | |||
IF( SISNAN( REAL( TAU(K) ) ) ) THEN | |||
TAUNAN = REAL( TAU(K) ) | |||
ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN | |||
TAUNAN = IMAG( TAU(K) ) | |||
ELSE IF( SISNAN( AIMAG( TAU(K) ) ) ) THEN | |||
TAUNAN = AIMAG( TAU(K) ) | |||
ELSE | |||
TAUNAN = ZERO | |||
END IF | |||
@@ -852,8 +852,9 @@ | |||
CALL SLASUM( 'CHB', NOUNIT, NERRS, NTESTT ) | |||
RETURN | |||
* | |||
9999 FORMAT( ' CCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', | |||
$ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) | |||
9999 FORMAT( ' CCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, | |||
$ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, | |||
$ ')' ) | |||
9998 FORMAT( / 1X, A3, | |||
$ ' -- Complex Hermitian Banded Tridiagonal Reduction Routines' | |||
$ ) | |||
@@ -840,8 +840,9 @@ | |||
CALL DLASUM( 'DSB', NOUNIT, NERRS, NTESTT ) | |||
RETURN | |||
* | |||
9999 FORMAT( ' DCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', | |||
$ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) | |||
9999 FORMAT( ' DCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, | |||
$ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, | |||
$ ')' ) | |||
* | |||
9998 FORMAT( / 1X, A3, | |||
$ ' -- Real Symmetric Banded Tridiagonal Reduction Routines' ) | |||
@@ -840,8 +840,9 @@ | |||
CALL SLASUM( 'SSB', NOUNIT, NERRS, NTESTT ) | |||
RETURN | |||
* | |||
9999 FORMAT( ' SCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', | |||
$ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) | |||
9999 FORMAT( ' SCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, | |||
$ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, | |||
$ ')' ) | |||
* | |||
9998 FORMAT( / 1X, A3, | |||
$ ' -- Real Symmetric Banded Tridiagonal Reduction Routines' ) | |||
@@ -849,8 +849,9 @@ | |||
CALL DLASUM( 'ZHB', NOUNIT, NERRS, NTESTT ) | |||
RETURN | |||
* | |||
9999 FORMAT( ' ZCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', | |||
$ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) | |||
9999 FORMAT( ' ZCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, | |||
$ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, | |||
$ ')' ) | |||
9998 FORMAT( / 1X, A3, | |||
$ ' -- Complex Hermitian Banded Tridiagonal Reduction Routines' | |||
$ ) | |||
@@ -954,7 +954,7 @@ | |||
$ 4X, '10. Random, Last columns are zero starting from', | |||
$ ' MINMN/2+1, CNDNUM = 2', / | |||
$ 4X, '11. Random, Half MINMN columns in the middle are', | |||
$ ' zero starting from MINMN/2-(MINMN/2)/2+1,' | |||
$ ' zero starting from MINMN/2-(MINMN/2)/2+1,', | |||
$ ' CNDNUM = 2', / | |||
$ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / | |||
$ 4X, '13. Random, Even columns are ZERO, CNDNUM = 2', / | |||
@@ -36,7 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#include <stdlib.h> | |||
#include "ctest.h" | |||
#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) | |||
//#define LAPACK_COMPLEX_STRUCTURE | |||
#define NOCHANGE | |||
#endif | |||
#include <common.h> | |||
#include <math.h> | |||
@@ -206,7 +206,7 @@ void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int ld | |||
* param lda_dst - leading dimension of output matrix A | |||
* param conj specifies conjugation | |||
*/ | |||
void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, | |||
void my_scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, | |||
float *a_dst, blasint lda_dst) | |||
{ | |||
blasint i, j; | |||
@@ -217,7 +217,7 @@ void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, | |||
} | |||
} | |||
void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, | |||
void my_dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, | |||
double *a_dst, blasint lda_dst) | |||
{ | |||
blasint i, j; | |||
@@ -228,7 +228,7 @@ void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, | |||
} | |||
} | |||
void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, | |||
void my_ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, | |||
float *a_dst, blasint lda_dst, int conj) | |||
{ | |||
blasint i, j; | |||
@@ -243,7 +243,7 @@ void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, | |||
} | |||
} | |||
void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, | |||
void my_zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, | |||
double *a_dst, blasint lda_dst, int conj) | |||
{ | |||
blasint i, j; | |||
@@ -65,12 +65,12 @@ extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, i | |||
extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, | |||
double *a_dst, blasint lda_dst, int conj); | |||
extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, | |||
extern void my_scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, | |||
float *a_dst, blasint lda_dst); | |||
extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, | |||
extern void my_dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, | |||
double *a_dst, blasint lda_dst); | |||
extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, | |||
extern void my_ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, | |||
float *a_dst, blasint lda_dst, int conj); | |||
extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, | |||
extern void my_zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, | |||
double *a_dst, blasint lda_dst, int conj); | |||
#endif | |||
#endif |
@@ -91,7 +91,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla | |||
ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); | |||
} | |||
else { | |||
ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); | |||
my_ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); | |||
} | |||
if (api == 'F') { | |||
@@ -92,7 +92,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla | |||
ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); | |||
} | |||
else { | |||
ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); | |||
my_ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); | |||
} | |||
if (api == 'F') { | |||
@@ -86,7 +86,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl | |||
dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); | |||
} | |||
else { | |||
dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); | |||
my_dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); | |||
} | |||
if (api == 'F') { | |||
@@ -87,7 +87,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, bl | |||
dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); | |||
} | |||
else { | |||
dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); | |||
my_dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); | |||
} | |||
if (api == 'F') { | |||
@@ -86,7 +86,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla | |||
stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); | |||
} | |||
else { | |||
scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); | |||
my_scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); | |||
} | |||
if (api == 'F') { | |||
@@ -87,7 +87,7 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla | |||
stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); | |||
} | |||
else { | |||
scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); | |||
my_scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); | |||
} | |||
if (api == 'F') { | |||
@@ -91,7 +91,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl | |||
ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); | |||
} | |||
else { | |||
zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); | |||
my_zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); | |||
} | |||
if (api == 'F') { | |||
@@ -92,7 +92,7 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl | |||
ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); | |||
} | |||
else { | |||
zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); | |||
my_zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); | |||
} | |||
if (api == 'F') { | |||