@@ -1,4 +1,119 @@ | |||
# XXX: Precise is already deprecated, new default is Trusty. | |||
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming | |||
dist: precise | |||
sudo: false | |||
language: c | |||
compiler: gcc | |||
jobs: | |||
include: | |||
- &test-ubuntu | |||
stage: test | |||
addons: | |||
apt: | |||
packages: | |||
- gfortran | |||
before_script: &common-before | |||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
script: | |||
- set -e | |||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
- make -C test $COMMON_FLAGS $BTYPE | |||
- make -C ctest $COMMON_FLAGS $BTYPE | |||
- make -C utest $COMMON_FLAGS $BTYPE | |||
env: | |||
- TARGET_BOX=LINUX64 | |||
- BTYPE="BINARY=64" | |||
- <<: *test-ubuntu | |||
env: | |||
- TARGET_BOX=LINUX64 | |||
- BTYPE="BINARY=64 USE_OPENMP=1" | |||
- <<: *test-ubuntu | |||
env: | |||
- TARGET_BOX=LINUX64 | |||
- BTYPE="BINARY=64 INTERFACE64=1" | |||
- <<: *test-ubuntu | |||
addons: | |||
apt: | |||
packages: | |||
- gcc-multilib | |||
- gfortran-multilib | |||
env: | |||
- TARGET_BOX=LINUX32 | |||
- BTYPE="BINARY=32" | |||
- stage: test | |||
addons: | |||
apt: | |||
packages: | |||
- binutils-mingw-w64-x86-64 | |||
- gcc-mingw-w64-x86-64 | |||
- gfortran-mingw-w64-x86-64 | |||
before_script: *common-before | |||
script: | |||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
env: | |||
- TARGET_BOX=WIN64 | |||
- BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||
# Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. | |||
# These jobs need sudo, so Travis runs them on VM-based infrastructure | |||
# which is slower than container-based infrastructure used for jobs | |||
# that don't require sudo. | |||
- &test-alpine | |||
stage: test | |||
dist: trusty | |||
sudo: true | |||
language: minimal | |||
before_install: | |||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ | |||
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" | |||
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
install: | |||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
before_script: *common-before | |||
script: | |||
- set -e | |||
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||
- alpine make -C test $COMMON_FLAGS $BTYPE | |||
- alpine make -C ctest $COMMON_FLAGS $BTYPE | |||
- alpine make -C utest $COMMON_FLAGS $BTYPE | |||
env: | |||
- TARGET_BOX=LINUX64_MUSL | |||
- BTYPE="BINARY=64" | |||
# XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, | |||
# so it's "allowed to fail" for now (see allow_failures). | |||
- &test-alpine-openmp | |||
<<: *test-alpine | |||
env: | |||
- TARGET_BOX=LINUX64_MUSL | |||
- BTYPE="BINARY=64 USE_OPENMP=1" | |||
- <<: *test-alpine | |||
env: | |||
- TARGET_BOX=LINUX64_MUSL | |||
- BTYPE="BINARY=64 INTERFACE64=1" | |||
# Build with the same flags as Alpine does in its OpenBLAS package. | |||
- <<: *test-alpine | |||
env: | |||
- TARGET_BOX=LINUX64_MUSL | |||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" | |||
allow_failures: | |||
- <<: *test-alpine-openmp | |||
# whitelist | |||
branches: | |||
only: | |||
- master | |||
- develop | |||
notifications: | |||
webhooks: | |||
@@ -7,32 +122,3 @@ notifications: | |||
on_success: change # options: [always|never|change] default: always | |||
on_failure: always # options: [always|never|change] default: always | |||
on_start: never # options: [always|never|change] default: always | |||
compiler: | |||
- gcc | |||
env: | |||
- TARGET_BOX=LINUX64 BTYPE="BINARY=64" | |||
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1" | |||
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1" | |||
- TARGET_BOX=LINUX32 BTYPE="BINARY=32" | |||
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||
before_install: | |||
- sudo apt-get update -qq | |||
- sudo apt-get install -qq gfortran | |||
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi | |||
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi | |||
script: | |||
- set -e | |||
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE | |||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
# whitelist | |||
branches: | |||
only: | |||
- master | |||
- develop |
@@ -12,31 +12,36 @@ clone_folder: c:\projects\OpenBLAS | |||
init: | |||
- git config --global core.autocrlf input | |||
build: | |||
project: OpenBLAS.sln | |||
clone_depth: 5 | |||
#branches to build | |||
branches: | |||
only: | |||
- master | |||
- develop | |||
- cmake | |||
skip_tags: true | |||
matrix: | |||
fast_finish: true | |||
fast_finish: false | |||
skip_commits: | |||
# Add [av skip] to commit messages | |||
message: /\[av skip\]/ | |||
environment: | |||
matrix: | |||
- COMPILER: clang-cl | |||
- COMPILER: cl | |||
install: | |||
- if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat | |||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
- if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake | |||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 | |||
before_build: | |||
- echo Running cmake... | |||
- cd c:\projects\OpenBLAS | |||
- cmake -G "Visual Studio 12 Win64" . | |||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 12 Win64" . | |||
- if [%COMPILER%]==[clang-cl] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl . | |||
build_script: | |||
- cmake --build . | |||
test_script: | |||
- echo Running Test | |||
@@ -28,6 +28,8 @@ | |||
set(FU "") | |||
if(APPLE) | |||
set(FU "_") | |||
elseif(MSVC AND ${CMAKE_C_COMPILER_ID} MATCHES "Clang") | |||
set(FU "") | |||
elseif(MSVC) | |||
set(FU "_") | |||
elseif(UNIX) | |||
@@ -59,7 +61,8 @@ endif () | |||
# CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. | |||
# | |||
# TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check | |||
set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) | |||
set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture") | |||
if (${ARCH} STREQUAL "AMD64") | |||
set(ARCH "x86_64") | |||
endif () | |||
@@ -51,7 +51,8 @@ else() | |||
endif() | |||
add_custom_command( | |||
TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
OUTPUT ${PROJECT_BINARY_DIR}/openblas.def | |||
#TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
COMMAND perl | |||
ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||
COMMENT "Create openblas.def file" | |||
@@ -66,15 +66,14 @@ set(GETARCH_SRC | |||
${CPUIDEMO} | |||
) | |||
if (NOT MSVC) | |||
if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC") | |||
#Use generic for MSVC now | |||
message("MSVC") | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
else() | |||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
endif () | |||
if (MSVC) | |||
#Use generic for MSVC now | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
endif() | |||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
# disable WindowsStore strict CRT checks | |||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) | |||
@@ -495,6 +495,33 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) | |||
#endif | |||
#ifndef ASSEMBLER | |||
/* C99 supports complex floating numbers natively, which GCC also offers as an | |||
extension since version 3.0. If neither is available, use a compatible | |||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) | |||
#define OPENBLAS_COMPLEX_C99 | |||
#ifndef __cplusplus | |||
#include <complex.h> | |||
#endif | |||
typedef float _Complex openblas_complex_float; | |||
typedef double _Complex openblas_complex_double; | |||
typedef xdouble _Complex openblas_complex_xdouble; | |||
#define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) | |||
#define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) | |||
#define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) | |||
#else | |||
#define OPENBLAS_COMPLEX_STRUCT | |||
typedef struct { float real, imag; } openblas_complex_float; | |||
typedef struct { double real, imag; } openblas_complex_double; | |||
typedef struct { xdouble real, imag; } openblas_complex_xdouble; | |||
#define openblas_make_complex_float(real, imag) {(real), (imag)} | |||
#define openblas_make_complex_double(real, imag) {(real), (imag)} | |||
#define openblas_make_complex_xdouble(real, imag) {(real), (imag)} | |||
#endif | |||
#endif | |||
#include "param.h" | |||
#include "common_param.h" | |||
@@ -524,31 +551,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
#include <stdio.h> | |||
#endif // NOINCLUDE | |||
/* C99 supports complex floating numbers natively, which GCC also offers as an | |||
extension since version 3.0. If neither are available, use a compatible | |||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) | |||
#define OPENBLAS_COMPLEX_C99 | |||
#ifndef __cplusplus | |||
#include <complex.h> | |||
#endif | |||
typedef float _Complex openblas_complex_float; | |||
typedef double _Complex openblas_complex_double; | |||
typedef xdouble _Complex openblas_complex_xdouble; | |||
#define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) | |||
#define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) | |||
#define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) | |||
#else | |||
#define OPENBLAS_COMPLEX_STRUCT | |||
typedef struct { float real, imag; } openblas_complex_float; | |||
typedef struct { double real, imag; } openblas_complex_double; | |||
typedef struct { xdouble real, imag; } openblas_complex_xdouble; | |||
#define openblas_make_complex_float(real, imag) {(real), (imag)} | |||
#define openblas_make_complex_double(real, imag) {(real), (imag)} | |||
#define openblas_make_complex_xdouble(real, imag) {(real), (imag)} | |||
#endif | |||
#ifdef XDOUBLE | |||
#define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble | |||
#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) | |||
@@ -333,8 +333,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | |||
float (*casum_k) (BLASLONG, float *, BLASLONG); | |||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
float _Complex (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
float _Complex (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -496,8 +496,8 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||
double (*znrm2_k) (BLASLONG, double *, BLASLONG); | |||
double (*zasum_k) (BLASLONG, double *, BLASLONG); | |||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
double _Complex (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
double _Complex (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
@@ -661,8 +661,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
xdouble _Complex (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
xdouble _Complex (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
@@ -230,8 +230,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT | |||
#ifndef TRANSA | |||
range_m[num_cpu] = num_cpu * ((m + 15) & ~15); | |||
if (range_m[num_cpu] > m) range_m[num_cpu] = m; | |||
#else | |||
range_m[num_cpu] = num_cpu * ((n + 15) & ~15); | |||
if (range_m[num_cpu] > n) range_m[num_cpu] = n; | |||
#endif | |||
queue[num_cpu].mode = mode; | |||
@@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = sbmv_kernel; | |||
@@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = sbmv_kernel; | |||
@@ -316,6 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * ((n + 15) & ~15); | |||
if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = sbmv_kernel; | |||
@@ -246,6 +246,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = spmv_kernel; | |||
@@ -285,6 +286,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = spmv_kernel; | |||
@@ -177,7 +177,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; | |||
queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; | |||
queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; | |||
@@ -225,6 +226,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = symv_kernel; | |||
@@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -307,7 +307,8 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = tpmv_kernel; | |||
queue[num_cpu].args = &args; | |||
@@ -346,6 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = tpmv_kernel; | |||
@@ -346,6 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -385,6 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].routine = trmv_kernel; | |||
@@ -155,7 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#ifdef DYNAMIC_ARCH | |||
gotoblas_t *gotoblas = NULL; | |||
#endif | |||
extern void openblas_warning(int verbose, const char * msg); | |||
#ifndef SMP | |||
@@ -187,25 +186,24 @@ int i,n; | |||
#if !defined(__GLIBC_PREREQ) | |||
return nums; | |||
#endif | |||
#if !__GLIBC_PREREQ(2, 3) | |||
#else | |||
#if !__GLIBC_PREREQ(2, 3) | |||
return nums; | |||
#endif | |||
#endif | |||
#if !__GLIBC_PREREQ(2, 7) | |||
#if !__GLIBC_PREREQ(2, 7) | |||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
if (ret!=0) return nums; | |||
n=0; | |||
#if !__GLIBC_PREREQ(2, 6) | |||
#if !__GLIBC_PREREQ(2, 6) | |||
for (i=0;i<nums;i++) | |||
if (CPU_ISSET(i,cpusetp)) n++; | |||
nums=n; | |||
#else | |||
#else | |||
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
#endif | |||
#endif | |||
return nums; | |||
#endif | |||
#else | |||
cpusetp = CPU_ALLOC(nums); | |||
if (cpusetp == NULL) return nums; | |||
size = CPU_ALLOC_SIZE(nums); | |||
@@ -214,6 +212,8 @@ int i,n; | |||
nums = CPU_COUNT_S(size,cpusetp); | |||
CPU_FREE(cpusetp); | |||
return nums; | |||
#endif | |||
#endif | |||
} | |||
#endif | |||
#endif | |||
@@ -1,7 +1,6 @@ | |||
include_directories(${PROJECT_SOURCE_DIR}) | |||
# Makefile | |||
function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
set (OPENBLAS_SRC "") | |||
@@ -21,7 +20,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
endif () | |||
if (${ARCH} STREQUAL "x86") | |||
if (NOT MSVC) | |||
if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") | |||
GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) | |||
else() | |||
GenerateNamedObjects("${KERNELDIR}/cpuid_win.c" "" "" false "" "" true) | |||
@@ -147,57 +147,57 @@ static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" fmov s6, "REG0" \n" | |||
" fmov s7, "REG0" \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lasum_kernel_S_BEGIN \n" | |||
" bne 5f //asum_kernel_S_BEGIN \n" | |||
".Lasum_kernel_F_BEGIN: \n" | |||
"1: //asum_kernel_F_BEGIN: \n" | |||
" asr "J", "N", #5 \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lasum_kernel_F1 \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
".Lasum_kernel_F32: \n" | |||
"2: //asum_kernel_F32: \n" | |||
" "KERNEL_F32" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F32 \n" | |||
" bne 2b //asum_kernel_F32 \n" | |||
" "KERNEL_F32_FINALIZE" \n" | |||
".Lasum_kernel_F1: \n" | |||
"3: //asum_kernel_F1: \n" | |||
" ands "J", "N", #31 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_F10: \n" | |||
"4: //asum_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F10 \n" | |||
" b .Lasum_kernel_L999 \n" | |||
" bne 4b //asum_kernel_F10 \n" | |||
" b 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S_BEGIN: \n" | |||
"5: //asum_kernel_S_BEGIN: \n" | |||
" "INIT_S" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lasum_kernel_S1 \n" | |||
" ble 7f //asum_kernel_S1 \n" | |||
".Lasum_kernel_S4: \n" | |||
"6: //asum_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S4 \n" | |||
" bne 6b //asum_kernel_S4 \n" | |||
".Lasum_kernel_S1: \n" | |||
"7: //asum_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S10: \n" | |||
"8: //asum_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S10 \n" | |||
" bne 8b //asum_kernel_S10 \n" | |||
".Lasum_kernel_L999: \n" | |||
"9: //asum_kernel_L999: \n" | |||
" fmov %[ASUM_], "SUMFD" \n" | |||
: [ASUM_] "=r" (asum) //%0 | |||
@@ -90,62 +90,62 @@ static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_ | |||
" mov "Y", %[Y_] \n" | |||
" mov "INC_Y", %[INCY_] \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lcopy_kernel_L999 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lcopy_kernel_S_BEGIN \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne .Lcopy_kernel_S_BEGIN \n" | |||
" bne 4f //copy_kernel_S_BEGIN \n" | |||
".Lcopy_kernel_F_BEGIN: \n" | |||
"// .Lcopy_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lcopy_kernel_F1 \n" | |||
" beq 2f //copy_kernel_F1 \n" | |||
" .align 5 \n" | |||
".Lcopy_kernel_F: \n" | |||
"1: //copy_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lcopy_kernel_F \n" | |||
" bne 1b //copy_kernel_F \n" | |||
".Lcopy_kernel_F1: \n" | |||
"2: //copy_kernel_F1: \n" | |||
#if defined(COMPLEX) && defined(DOUBLE) | |||
" b .Lcopy_kernel_L999 \n" | |||
" b 8f //copy_kernel_L999 \n" | |||
#else | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble .Lcopy_kernel_L999 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
#endif | |||
".Lcopy_kernel_F10: \n" | |||
"3: //copy_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lcopy_kernel_F10 \n" | |||
" b .Lcopy_kernel_L999 \n" | |||
" bne 3b //copy_kernel_F10 \n" | |||
" b 8f //copy_kernel_L999 \n" | |||
".Lcopy_kernel_S_BEGIN: \n" | |||
"4: //copy_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lcopy_kernel_S1 \n" | |||
" ble 6f //copy_kernel_S1 \n" | |||
".Lcopy_kernel_S4: \n" | |||
"5: //copy_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lcopy_kernel_S4 \n" | |||
" bne 5b //copy_kernel_S4 \n" | |||
".Lcopy_kernel_S1: \n" | |||
"6: //copy_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lcopy_kernel_L999 \n" | |||
" ble 8f //copy_kernel_L999 \n" | |||
".Lcopy_kernel_S10: \n" | |||
"7: //copy_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lcopy_kernel_S10 \n" | |||
" bne 7b //copy_kernel_S10 \n" | |||
".Lcopy_kernel_L999: \n" | |||
"8: //copy_kernel_L999: \n" | |||
: | |||
: [N_] "r" (n), //%1 | |||
@@ -141,58 +141,58 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" fmov d6, "REG0" \n" | |||
" fmov d7, "REG0" \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lasum_kernel_S_BEGIN \n" | |||
" bne 5f //asum_kernel_S_BEGIN \n" | |||
".Lasum_kernel_F_BEGIN: \n" | |||
"1: //asum_kernel_F_BEGIN: \n" | |||
" asr "J", "N", #5 \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lasum_kernel_F1 \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
".align 5 \n" | |||
".Lasum_kernel_F32: \n" | |||
"2: //asum_kernel_F32: \n" | |||
" "KERNEL_F32" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F32 \n" | |||
" bne 2b //asum_kernel_F32 \n" | |||
" "KERNEL_F32_FINALIZE" \n" | |||
".Lasum_kernel_F1: \n" | |||
"3: //asum_kernel_F1: \n" | |||
" ands "J", "N", #31 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_F10: \n" | |||
"4: //asum_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F10 \n" | |||
" b .Lasum_kernel_L999 \n" | |||
" bne 4b //asum_kernel_F10 \n" | |||
" b 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S_BEGIN: \n" | |||
"5: //asum_kernel_S_BEGIN: \n" | |||
" "INIT_S" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lasum_kernel_S1 \n" | |||
" ble 7f //asum_kernel_S1 \n" | |||
".Lasum_kernel_S4: \n" | |||
"6: //asum_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S4 \n" | |||
" bne 6b //asum_kernel_S4 \n" | |||
".Lasum_kernel_S1: \n" | |||
"7: //asum_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S10: \n" | |||
"8: //asum_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S10 \n" | |||
" bne 8b //asum_kernel_S10 \n" | |||
".Lasum_kernel_L999: \n" | |||
"9: //asum_kernel_L999: \n" | |||
" fmov %[ASUM_], "SUMF" \n" | |||
: [ASUM_] "=r" (asum) //%0 | |||
@@ -291,61 +291,61 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | |||
" fmov d6, xzr \n" | |||
" fmov d7, xzr \n" | |||
" cmp "N", xzr \n" | |||
" ble .Ldot_kernel_L999 \n" | |||
" ble 9f //dot_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Ldot_kernel_S_BEGIN \n" | |||
" bne 5f //dot_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne .Ldot_kernel_S_BEGIN \n" | |||
" bne 5f //dot_kernel_S_BEGIN \n" | |||
".Ldot_kernel_F_BEGIN: \n" | |||
"1: //dot_kernel_F_BEGIN: \n" | |||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Ldot_kernel_F1 \n" | |||
" beq 3f //dot_kernel_F1 \n" | |||
" .align 5 \n" | |||
".Ldot_kernel_F: \n" | |||
"2: //dot_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_F \n" | |||
" bne 2b //dot_kernel_F \n" | |||
" "KERNEL_F_FINALIZE" \n" | |||
".Ldot_kernel_F1: \n" | |||
"3: //dot_kernel_F1: \n" | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble .Ldot_kernel_L999 \n" | |||
" ble 9f //dot_kernel_L999 \n" | |||
".Ldot_kernel_F10: \n" | |||
"4: //dot_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_F10 \n" | |||
" b .Ldot_kernel_L999 \n" | |||
" bne 4b //dot_kernel_F10 \n" | |||
" b 9f //dot_kernel_L999 \n" | |||
".Ldot_kernel_S_BEGIN: \n" | |||
"5: //dot_kernel_S_BEGIN: \n" | |||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Ldot_kernel_S1 \n" | |||
" ble 7f //dot_kernel_S1 \n" | |||
".Ldot_kernel_S4: \n" | |||
"6: //dot_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_S4 \n" | |||
" bne 6b //dot_kernel_S4 \n" | |||
".Ldot_kernel_S1: \n" | |||
"7: //dot_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Ldot_kernel_L999 \n" | |||
" ble 9f //dot_kernel_L999 \n" | |||
".Ldot_kernel_S10: \n" | |||
"8: //dot_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_S10 \n" | |||
" bne 8b //dot_kernel_S10 \n" | |||
".Ldot_kernel_L999: \n" | |||
"9: //dot_kernel_L999: \n" | |||
" str "DOTF", [%[DOT_]] \n" | |||
: | |||
@@ -74,33 +74,33 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
" fmov "SCALE", xzr \n" | |||
" fmov "SSQ", #1.0 \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lnrm2_kernel_L999 \n" | |||
" ble 9f //nrm2_kernel_L999 \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lnrm2_kernel_L999 \n" | |||
" ble 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_F_BEGIN: \n" | |||
"1: //nrm2_kernel_F_BEGIN: \n" | |||
" fmov "REGZERO", xzr \n" | |||
" fmov "REGONE", #1.0 \n" | |||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | |||
" mov "J", "N" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lnrm2_kernel_L999 \n" | |||
" beq 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_F_ZERO_SKIP: \n" | |||
"2: //nrm2_kernel_F_ZERO_SKIP: \n" | |||
" ldr d4, ["X"] \n" | |||
" fcmp d4, "REGZERO" \n" | |||
" bne .Lnrm2_kernel_F_INIT \n" | |||
" bne 3f //nrm2_kernel_F_INIT \n" | |||
#if defined(COMPLEX) | |||
" ldr d4, ["X", #8] \n" | |||
" fcmp d4, "REGZERO" \n" | |||
" bne .Lnrm2_kernel_F_INIT_I \n" | |||
" bne 4f //nrm2_kernel_F_INIT_I \n" | |||
#endif | |||
" add "X", "X", "INC_X" \n" | |||
" subs "J", "J", #1 \n" | |||
" beq .Lnrm2_kernel_L999 \n" | |||
" b .Lnrm2_kernel_F_ZERO_SKIP \n" | |||
" beq 9f //nrm2_kernel_L999 \n" | |||
" b 2b //nrm2_kernel_F_ZERO_SKIP \n" | |||
".Lnrm2_kernel_F_INIT: \n" | |||
"3: //nrm2_kernel_F_INIT: \n" | |||
" ldr d4, ["X"] \n" | |||
" fabs d4, d4 \n" | |||
" fmax "CUR_MAX", "SCALE", d4 \n" | |||
@@ -112,7 +112,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
" fadd "SSQ", "SSQ", d4 \n" | |||
" fmov "SCALE", "CUR_MAX" \n" | |||
#if defined(COMPLEX) | |||
".Lnrm2_kernel_F_INIT_I: \n" | |||
"4: //nrm2_kernel_F_INIT_I: \n" | |||
" ldr d3, ["X", #8] \n" | |||
" fabs d3, d3 \n" | |||
" fmax "CUR_MAX", "SCALE", d3 \n" | |||
@@ -126,16 +126,16 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
#endif | |||
" add "X", "X", "INC_X" \n" | |||
" subs "J", "J", #1 \n" | |||
" beq .Lnrm2_kernel_L999 \n" | |||
" beq 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_F_START: \n" | |||
"5: //nrm2_kernel_F_START: \n" | |||
" cmp "INC_X", #"SZ" \n" | |||
" bne .Lnrm2_kernel_F1 \n" | |||
" bne 8f //nrm2_kernel_F1 \n" | |||
" asr "K", "J", #4 \n" | |||
" cmp "K", xzr \n" | |||
" beq .Lnrm2_kernel_F1 \n" | |||
" beq 8f //nrm2_kernel_F1 \n" | |||
".Lnrm2_kernel_F: \n" | |||
"6: //nrm2_kernel_F: \n" | |||
" ldp q16, q17, ["X"] \n" | |||
" ldp q18, q19, ["X", #32] \n" | |||
" ldp q20, q21, ["X", #64] \n" | |||
@@ -255,13 +255,13 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
" fmov "SCALE", "CUR_MAX" \n" | |||
#endif | |||
" subs "K", "K", #1 \n" | |||
" bne .Lnrm2_kernel_F \n" | |||
" bne 6b //nrm2_kernel_F \n" | |||
".Lnrm2_kernel_F_DONE: \n" | |||
"7: //nrm2_kernel_F_DONE: \n" | |||
" ands "J", "J", #15 \n" | |||
" beq .Lnrm2_kernel_L999 \n" | |||
" beq 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_F1: \n" | |||
"8: //nrm2_kernel_F1: \n" | |||
" ldr d4, ["X"] \n" | |||
" fabs d4, d4 \n" | |||
" fmax "CUR_MAX", "SCALE", d4 \n" | |||
@@ -286,9 +286,9 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
#endif | |||
" add "X", "X", "INC_X" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lnrm2_kernel_F1 \n" | |||
" bne 8b //nrm2_kernel_F1 \n" | |||
".Lnrm2_kernel_L999: \n" | |||
"9: //nrm2_kernel_L999: \n" | |||
" str "SSQ", [%[SSQ_]] \n" | |||
" str "SCALE", [%[SCALE_]] \n" | |||
@@ -208,7 +208,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n | |||
#endif | |||
static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
static BLASLONG __attribute__((noinline)) iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
{ | |||
BLASLONG index = 0; | |||
@@ -220,72 +220,72 @@ static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" mov "INC_X", %[INCX_] \n" | |||
" cmp "N", xzr \n" | |||
" ble .Liamax_kernel_zero \n" | |||
" ble 10f //iamax_kernel_zero \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Liamax_kernel_zero \n" | |||
" ble 10f //iamax_kernel_zero \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Liamax_kernel_S_BEGIN \n" | |||
" bne 5f //iamax_kernel_S_BEGIN \n" | |||
" mov x7, "X" \n" | |||
".Liamax_kernel_F_BEGIN: \n" | |||
"1: //iamax_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" subs "N", "N", #1 \n" | |||
" ble .Liamax_kernel_L999 \n" | |||
" ble 9f //iamax_kernel_L999 \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Liamax_kernel_F1 \n" | |||
" beq 3f //iamax_kernel_F1 \n" | |||
" add "Z", "Z", #1 \n" | |||
".Liamax_kernel_F: \n" | |||
"2: //iamax_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Liamax_kernel_F \n" | |||
" bne 2b //iamax_kernel_F \n" | |||
" "KERNEL_F_FINALIZE" \n" | |||
" sub "Z", "Z", #1 \n" | |||
".Liamax_kernel_F1: \n" | |||
"3: //iamax_kernel_F1: \n" | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble .Liamax_kernel_L999 \n" | |||
" ble 9f //iamax_kernel_L999 \n" | |||
".Liamax_kernel_F10: \n" | |||
"4: //iamax_kernel_F10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Liamax_kernel_F10 \n" | |||
" b .Liamax_kernel_L999 \n" | |||
" bne 4b //iamax_kernel_F10 \n" | |||
" b 9f //iamax_kernel_L999 \n" | |||
".Liamax_kernel_S_BEGIN: \n" | |||
"5: //iamax_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" subs "N", "N", #1 \n" | |||
" ble .Liamax_kernel_L999 \n" | |||
" ble 9f //iamax_kernel_L999 \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Liamax_kernel_S1 \n" | |||
" ble 7f //iamax_kernel_S1 \n" | |||
".Liamax_kernel_S4: \n" | |||
"6: //iamax_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Liamax_kernel_S4 \n" | |||
" bne 6b //iamax_kernel_S4 \n" | |||
".Liamax_kernel_S1: \n" | |||
"7: //iamax_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Liamax_kernel_L999 \n" | |||
" ble 9f //iamax_kernel_L999 \n" | |||
".Liamax_kernel_S10: \n" | |||
"8: //iamax_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Liamax_kernel_S10 \n" | |||
" bne 8b //iamax_kernel_S10 \n" | |||
".Liamax_kernel_L999: \n" | |||
"9: //iamax_kernel_L999: \n" | |||
" mov x0, "INDEX" \n" | |||
" b .Liamax_kernel_DONE \n" | |||
" b 11f //iamax_kernel_DONE \n" | |||
".Liamax_kernel_zero: \n" | |||
"10: //iamax_kernel_zero: \n" | |||
" mov x0, xzr \n" | |||
".Liamax_kernel_DONE: \n" | |||
"11: //iamax_kernel_DONE: \n" | |||
" mov %[INDEX_], "INDEX" \n" | |||
: [INDEX_] "=r" (index) //%0 | |||
@@ -229,72 +229,72 @@ static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" mov "INC_X", %[INCX_] \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lizamax_kernel_zero \n" | |||
" ble 10f //izamax_kernel_zero \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lizamax_kernel_zero \n" | |||
" ble 10f //izamax_kernel_zero \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lizamax_kernel_S_BEGIN \n" | |||
" bne 5f //izamax_kernel_S_BEGIN \n" | |||
" mov x7, "X" \n" | |||
".Lizamax_kernel_F_BEGIN: \n" | |||
"1: //izamax_kernel_F_BEGIN: \n" | |||
" "INIT" \n" | |||
" subs "N", "N", #1 \n" | |||
" ble .Lizamax_kernel_L999 \n" | |||
" ble 9f //izamax_kernel_L999 \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lizamax_kernel_F1 \n" | |||
" beq 3f //izamax_kernel_F1 \n" | |||
" add "Z", "Z", #1 \n" | |||
".Lizamax_kernel_F: \n" | |||
"2: //izamax_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lizamax_kernel_F \n" | |||
" bne 2b //izamax_kernel_F \n" | |||
" "KERNEL_F_FINALIZE" \n" | |||
" sub "Z", "Z", #1 \n" | |||
".Lizamax_kernel_F1: \n" | |||
"3: //izamax_kernel_F1: \n" | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble .Lizamax_kernel_L999 \n" | |||
" ble 9f //izamax_kernel_L999 \n" | |||
".Lizamax_kernel_F10: \n" | |||
"4: //izamax_kernel_F10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lizamax_kernel_F10 \n" | |||
" b .Lizamax_kernel_L999 \n" | |||
" bne 4b //izamax_kernel_F10 \n" | |||
" b 9f //izamax_kernel_L999 \n" | |||
".Lizamax_kernel_S_BEGIN: \n" | |||
"5: //izamax_kernel_S_BEGIN: \n" | |||
" "INIT" \n" | |||
" subs "N", "N", #1 \n" | |||
" ble .Lizamax_kernel_L999 \n" | |||
" ble 9f //izamax_kernel_L999 \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lizamax_kernel_S1 \n" | |||
" ble 7f //izamax_kernel_S1 \n" | |||
".Lizamax_kernel_S4: \n" | |||
"6: //izamax_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lizamax_kernel_S4 \n" | |||
" bne 6b //izamax_kernel_S4 \n" | |||
".Lizamax_kernel_S1: \n" | |||
"7: //izamax_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lizamax_kernel_L999 \n" | |||
" ble 9f //izamax_kernel_L999 \n" | |||
".Lizamax_kernel_S10: \n" | |||
"8: //izamax_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lizamax_kernel_S10 \n" | |||
" bne 8b //izamax_kernel_S10 \n" | |||
".Lizamax_kernel_L999: \n" | |||
"9: //izamax_kernel_L999: \n" | |||
" mov x0, "INDEX" \n" | |||
" b .Lizamax_kernel_DONE \n" | |||
" b 11f //izamax_kernel_DONE \n" | |||
".Lizamax_kernel_zero: \n" | |||
"10: //izamax_kernel_zero: \n" | |||
" mov x0, xzr \n" | |||
".Lizamax_kernel_DONE: \n" | |||
"11: //izamax_kernel_DONE: \n" | |||
" mov %[INDEX_], "INDEX" \n" | |||
: [INDEX_] "=r" (index) //%0 | |||
@@ -143,58 +143,58 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" fmov s6, "REG0" \n" | |||
" fmov s7, "REG0" \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lasum_kernel_S_BEGIN \n" | |||
" bne 5f //asum_kernel_S_BEGIN \n" | |||
".Lasum_kernel_F_BEGIN: \n" | |||
"1: //asum_kernel_F_BEGIN: \n" | |||
" asr "J", "N", #6 \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lasum_kernel_F1 \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
".align 5 \n" | |||
".Lasum_kernel_F64: \n" | |||
"2: //asum_kernel_F64: \n" | |||
" "KERNEL_F64" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F64 \n" | |||
" bne 2b //asum_kernel_F64 \n" | |||
" "KERNEL_F64_FINALIZE" \n" | |||
".Lasum_kernel_F1: \n" | |||
"3: //asum_kernel_F1: \n" | |||
" ands "J", "N", #63 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_F10: \n" | |||
"4: //asum_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F10 \n" | |||
" b .Lasum_kernel_L999 \n" | |||
" bne 4b //asum_kernel_F10 \n" | |||
" b 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S_BEGIN: \n" | |||
"5: //asum_kernel_S_BEGIN: \n" | |||
" "INIT_S" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lasum_kernel_S1 \n" | |||
" ble 7f //asum_kernel_S1 \n" | |||
".Lasum_kernel_S4: \n" | |||
"6: //asum_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S4 \n" | |||
" bne 6b //asum_kernel_S4 \n" | |||
".Lasum_kernel_S1: \n" | |||
"7: //asum_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S10: \n" | |||
"8: //asum_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S10 \n" | |||
" bne 8b //asum_kernel_S10 \n" | |||
".Lasum_kernel_L999: \n" | |||
"9: //asum_kernel_L999: \n" | |||
" fmov %[ASUM_], "SUMFD" \n" | |||
: [ASUM_] "=r" (asum) //%0 | |||
@@ -227,58 +227,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" fmov d6, xzr \n" | |||
" fmov d7, xzr \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lnrm2_kernel_L999 \n" | |||
" ble 9f //nrm2_kernel_L999 \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lnrm2_kernel_L999 \n" | |||
" ble 9f //nrm2_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lnrm2_kernel_S_BEGIN \n" | |||
" bne 5f //nrm2_kernel_S_BEGIN \n" | |||
".Lnrm2_kernel_F_BEGIN: \n" | |||
"1: //nrm2_kernel_F_BEGIN: \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lnrm2_kernel_S_BEGIN \n" | |||
" beq 5f //nrm2_kernel_S_BEGIN \n" | |||
" .align 5 \n" | |||
".Lnrm2_kernel_F: \n" | |||
"2: //nrm2_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lnrm2_kernel_F \n" | |||
" bne 2b //nrm2_kernel_F \n" | |||
" "KERNEL_F_FINALIZE" \n" | |||
".Lnrm2_kernel_F1: \n" | |||
"3: //nrm2_kernel_F1: \n" | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble .Lnrm2_kernel_L999 \n" | |||
" ble 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_F10: \n" | |||
"4: //nrm2_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lnrm2_kernel_F10 \n" | |||
" b .Lnrm2_kernel_L999 \n" | |||
" bne 4b //nrm2_kernel_F10 \n" | |||
" b 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_S_BEGIN: \n" | |||
"5: //nrm2_kernel_S_BEGIN: \n" | |||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lnrm2_kernel_S1 \n" | |||
" ble 7f //nrm2_kernel_S1 \n" | |||
".Lnrm2_kernel_S4: \n" | |||
"6: //nrm2_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lnrm2_kernel_S4 \n" | |||
" bne 6b //nrm2_kernel_S4 \n" | |||
".Lnrm2_kernel_S1: \n" | |||
"7: //nrm2_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lnrm2_kernel_L999 \n" | |||
" ble 9f //nrm2_kernel_L999 \n" | |||
".Lnrm2_kernel_S10: \n" | |||
"8: //nrm2_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lnrm2_kernel_S10 \n" | |||
" bne 8b //nrm2_kernel_S10 \n" | |||
".Lnrm2_kernel_L999: \n" | |||
"9: //nrm2_kernel_L999: \n" | |||
" "KERNEL_FINALIZE" \n" | |||
" fmov %[RET_], "SSQD" \n" | |||
@@ -143,58 +143,58 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
" fmov d6, "REG0" \n" | |||
" fmov d7, "REG0" \n" | |||
" cmp "N", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", xzr \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Lasum_kernel_S_BEGIN \n" | |||
" bne 5f //asum_kernel_S_BEGIN \n" | |||
".Lasum_kernel_F_BEGIN: \n" | |||
"1: //asum_kernel_F_BEGIN: \n" | |||
" asr "J", "N", #4 \n" | |||
" cmp "J", xzr \n" | |||
" beq .Lasum_kernel_F1 \n" | |||
" beq 3f //asum_kernel_F1 \n" | |||
".align 5 \n" | |||
".Lasum_kernel_F16: \n" | |||
"2: //asum_kernel_F16: \n" | |||
" "KERNEL_F16" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F16 \n" | |||
" bne 2b //asum_kernel_F16 \n" | |||
" "KERNEL_F16_FINALIZE" \n" | |||
".Lasum_kernel_F1: \n" | |||
"3: //asum_kernel_F1: \n" | |||
" ands "J", "N", #15 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_F10: \n" | |||
"4: //asum_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_F10 \n" | |||
" b .Lasum_kernel_L999 \n" | |||
" bne 4b //asum_kernel_F10 \n" | |||
" b 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S_BEGIN: \n" | |||
"5: //asum_kernel_S_BEGIN: \n" | |||
" "INIT_S" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Lasum_kernel_S1 \n" | |||
" ble 7f //asum_kernel_S1 \n" | |||
".Lasum_kernel_S4: \n" | |||
"6: //asum_kernel_S4: \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S4 \n" | |||
" bne 6b //asum_kernel_S4 \n" | |||
".Lasum_kernel_S1: \n" | |||
"7: //asum_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Lasum_kernel_L999 \n" | |||
" ble 9f //asum_kernel_L999 \n" | |||
".Lasum_kernel_S10: \n" | |||
"8: //asum_kernel_S10: \n" | |||
" "KERNEL_S1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Lasum_kernel_S10 \n" | |||
" bne 8b //asum_kernel_S10 \n" | |||
".Lasum_kernel_L999: \n" | |||
"9: //asum_kernel_L999: \n" | |||
" fmov %[ASUM_], "SUMF" \n" | |||
: [ASUM_] "=r" (asum) //%0 | |||
@@ -218,61 +218,61 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON | |||
" fmov d6, xzr \n" | |||
" fmov d7, xzr \n" | |||
" cmp "N", xzr \n" | |||
" ble .Ldot_kernel_L999 \n" | |||
" ble 9f //dot_kernel_L999 \n" | |||
" cmp "INC_X", #1 \n" | |||
" bne .Ldot_kernel_S_BEGIN \n" | |||
" bne 5f //dot_kernel_S_BEGIN \n" | |||
" cmp "INC_Y", #1 \n" | |||
" bne .Ldot_kernel_S_BEGIN \n" | |||
" bne 5f //dot_kernel_S_BEGIN \n" | |||
".Ldot_kernel_F_BEGIN: \n" | |||
"1: //dot_kernel_F_BEGIN: \n" | |||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
" asr "J", "N", #"N_DIV_SHIFT" \n" | |||
" cmp "J", xzr \n" | |||
" beq .Ldot_kernel_F1 \n" | |||
" beq 3f //dot_kernel_F1 \n" | |||
" .align 5 \n" | |||
".Ldot_kernel_F: \n" | |||
"2: //dot_kernel_F: \n" | |||
" "KERNEL_F" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_F \n" | |||
" bne 2b //dot_kernel_F \n" | |||
" "KERNEL_F_FINALIZE" \n" | |||
".Ldot_kernel_F1: \n" | |||
"3: //dot_kernel_F1: \n" | |||
" ands "J", "N", #"N_REM_MASK" \n" | |||
" ble .Ldot_kernel_L999 \n" | |||
" ble 9f //dot_kernel_L999 \n" | |||
".Ldot_kernel_F10: \n" | |||
"4: //dot_kernel_F10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_F10 \n" | |||
" b .Ldot_kernel_L999 \n" | |||
" bne 4b //dot_kernel_F10 \n" | |||
" b 9f //dot_kernel_L999 \n" | |||
".Ldot_kernel_S_BEGIN: \n" | |||
"5: //dot_kernel_S_BEGIN: \n" | |||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
" asr "J", "N", #2 \n" | |||
" cmp "J", xzr \n" | |||
" ble .Ldot_kernel_S1 \n" | |||
" ble 7f //dot_kernel_S1 \n" | |||
".Ldot_kernel_S4: \n" | |||
"6: //dot_kernel_S4: \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_S4 \n" | |||
" bne 6b //dot_kernel_S4 \n" | |||
".Ldot_kernel_S1: \n" | |||
"7: //dot_kernel_S1: \n" | |||
" ands "J", "N", #3 \n" | |||
" ble .Ldot_kernel_L999 \n" | |||
" ble 9f //dot_kernel_L999 \n" | |||
".Ldot_kernel_S10: \n" | |||
"8: //dot_kernel_S10: \n" | |||
" "KERNEL_F1" \n" | |||
" subs "J", "J", #1 \n" | |||
" bne .Ldot_kernel_S10 \n" | |||
" bne 8b //dot_kernel_S10 \n" | |||
".Ldot_kernel_L999: \n" | |||
"9: //dot_kernel_L999: \n" | |||
" str "DOTF", [%[DOTR_]] \n" | |||
" str "DOTI", [%[DOTI_]] \n" | |||
@@ -91,16 +91,15 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
#endif | |||
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
FLOAT _Complex result; | |||
FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; | |||
if ( n <= 0 ) | |||
{ | |||
result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; | |||
OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; | |||
return(result); | |||
} | |||
@@ -160,11 +159,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||
} | |||
#if !defined(CONJ) | |||
result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; | |||
OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; | |||
// CREAL(result) = dot[0] - dot[1]; | |||
// CIMAG(result) = dot[4] + dot[5]; | |||
#else | |||
result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; | |||
OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; | |||
// CREAL(result) = dot[0] + dot[1]; | |||
// CIMAG(result) = dot[4] - dot[5]; | |||
@@ -86,18 +86,17 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
#endif | |||
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
BLASLONG i; | |||
BLASLONG ix,iy; | |||
FLOAT _Complex result; | |||
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||
if ( n <= 0 ) | |||
{ | |||
// CREAL(result) = 0.0 ; | |||
// CIMAG(result) = 0.0 ; | |||
result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
return(result); | |||
} | |||
@@ -151,11 +150,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||
} | |||
#if !defined(CONJ) | |||
result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); | |||
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); | |||
// CREAL(result) = dot[0] - dot[1]; | |||
// CIMAG(result) = dot[2] + dot[3]; | |||
#else | |||
result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); | |||
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); | |||
// CREAL(result) = dot[0] + dot[1]; | |||
// CIMAG(result) = dot[2] - dot[3]; | |||
@@ -59,7 +59,7 @@ typedef int blasint; | |||
extension since version 3.0. If neither are available, use a compatible | |||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) | |||
(__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) | |||
#define OPENBLAS_COMPLEX_C99 | |||
#ifndef __cplusplus | |||
#include <complex.h> | |||
@@ -1,10 +1,14 @@ | |||
include_directories(${PROJECT_SOURCE_DIR}) | |||
include_directories(${PROJECT_BINARY_DIR}) | |||
set(OpenBLAS_utest_src | |||
utest_main.c | |||
test_amax.c | |||
if (MSVC AND "${CMAKE_C_COMPILER_ID}" MATCHES Clang) | |||
set(OpenBLAS_utest_src utest_main2.c) | |||
else () | |||
set(OpenBLAS_utest_src | |||
utest_main.c | |||
test_amax.c | |||
) | |||
endif () | |||
if (NOT NO_LAPACK) | |||
set(OpenBLAS_utest_src | |||
@@ -36,7 +40,7 @@ endforeach() | |||
if (MSVC) | |||
add_custom_command(TARGET ${OpenBLAS_utest_bin} | |||
POST_BUILD | |||
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/$<CONFIG>/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. | |||
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. | |||
) | |||
endif() | |||
@@ -0,0 +1,61 @@ | |||
/***************************************************************************** | |||
Copyright (c) 2011-2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
**********************************************************************************/ | |||
#include <stdio.h> | |||
#define CTEST_MAIN | |||
#define CTEST_SEGFAULT | |||
#define CTEST_ADD_TESTS_MANUALLY | |||
#include "openblas_utest.h" | |||
CTEST(amax, samax){ | |||
blasint N=3, inc=1; | |||
float te_max=0.0, tr_max=0.0; | |||
float x[]={-1.1, 2.2, -3.3}; | |||
te_max=BLASFUNC(samax)(&N, x, &inc); | |||
tr_max=3.3; | |||
ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); | |||
} | |||
int main(int argc, const char ** argv){ | |||
CTEST_ADD(amax, samax); | |||
int num_fail=0; | |||
num_fail=ctest_main(argc, argv); | |||
return num_fail; | |||
} | |||