| @@ -91,3 +91,4 @@ benchmark/*.goto | |||||
| benchmark/smallscaling | benchmark/smallscaling | ||||
| CMakeCache.txt | CMakeCache.txt | ||||
| CMakeFiles/* | CMakeFiles/* | ||||
| .vscode | |||||
| @@ -212,7 +212,8 @@ Please note that it is not possible to combine support for different architectur | |||||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | ||||
| - **AIX**: Supported on PPC up to POWER8 | - **AIX**: Supported on PPC up to POWER8 | ||||
| - **Haiku**: Supported by the community. We don't actively test the library on this OS. | - **Haiku**: Supported by the community. We don't actively test the library on this OS. | ||||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS: | |||||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS. | |||||
| - **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>. | |||||
| ## Usage | ## Usage | ||||
| @@ -30,10 +30,11 @@ environment: | |||||
| CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | ||||
| matrix: | matrix: | ||||
| - COMPILER: clang-cl | - COMPILER: clang-cl | ||||
| WITH_FORTRAN: yes | |||||
| WITH_FORTRAN: ON | |||||
| USE_OPENMP: ON | |||||
| - COMPILER: clang-cl | - COMPILER: clang-cl | ||||
| DYNAMIC_ARCH: ON | DYNAMIC_ARCH: ON | ||||
| WITH_FORTRAN: no | |||||
| WITH_FORTRAN: OFF | |||||
| - COMPILER: cl | - COMPILER: cl | ||||
| - COMPILER: MinGW64-gcc-7.2.0-mingw | - COMPILER: MinGW64-gcc-7.2.0-mingw | ||||
| DYNAMIC_ARCH: OFF | DYNAMIC_ARCH: OFF | ||||
| @@ -47,12 +48,7 @@ environment: | |||||
| install: | install: | ||||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | ||||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | ||||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake | |||||
| - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja | |||||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja | |||||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang | |||||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||||
| - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | ||||
| - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | ||||
| - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" | - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" | ||||
| @@ -68,8 +64,9 @@ before_build: | |||||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | ||||
| - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | ||||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | ||||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||||
| - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||||
| - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||||
| - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. | |||||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | ||||
| build_script: | build_script: | ||||
| @@ -148,16 +148,20 @@ endif () | |||||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | ||||
| if (DEFINED TARGET) | if (DEFINED TARGET) | ||||
| if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | ||||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | ||||
| if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | ||||
| else() | else() | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | ||||
| endif() | endif() | ||||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||||
| # endif() | |||||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||||
| else() | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||||
| endif() | |||||
| endif() | |||||
| endif() | endif() | ||||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | ||||
| @@ -233,6 +237,11 @@ if (BINARY64) | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if(EMBEDDED) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") | |||||
| endif() | |||||
| if (NEED_PIC) | if (NEED_PIC) | ||||
| if (${CMAKE_C_COMPILER} STREQUAL "IBM") | if (${CMAKE_C_COMPILER} STREQUAL "IBM") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") | set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") | ||||
| @@ -122,7 +122,7 @@ extern "C" { | |||||
| #define ATOM GOTO_ATOM | #define ATOM GOTO_ATOM | ||||
| #undef GOTO_ATOM | #undef GOTO_ATOM | ||||
| #endif | #endif | ||||
| #else | |||||
| #elif !defined(OS_EMBEDDED) | |||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||
| #ifndef NO_SYSV_IPC | #ifndef NO_SYSV_IPC | ||||
| #include <sys/shm.h> | #include <sys/shm.h> | ||||
| @@ -134,6 +134,9 @@ extern "C" { | |||||
| #if defined(SMP) || defined(USE_LOCKING) | #if defined(SMP) || defined(USE_LOCKING) | ||||
| #include <pthread.h> | #include <pthread.h> | ||||
| #endif | #endif | ||||
| #else | |||||
| #include <time.h> | |||||
| #include <math.h> | |||||
| #endif | #endif | ||||
| #if defined(OS_SUNOS) | #if defined(OS_SUNOS) | ||||
| @@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){ | |||||
| struct timespec ts; | struct timespec ts; | ||||
| clock_gettime(CLOCK_MONOTONIC, &ts); | clock_gettime(CLOCK_MONOTONIC, &ts); | ||||
| return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; | return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; | ||||
| #else | |||||
| #elif !defined(OS_EMBEDDED) | |||||
| struct timeval tv; | struct timeval tv; | ||||
| gettimeofday(&tv,NULL); | gettimeofday(&tv,NULL); | ||||
| return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; | return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; | ||||
| #else | |||||
| return 0; | |||||
| #endif | #endif | ||||
| } | } | ||||
| #define RPCC_DEFINED | #define RPCC_DEFINED | ||||
| @@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||||
| #include "common_linux.h" | #include "common_linux.h" | ||||
| #endif | #endif | ||||
| #ifdef OS_EMBEDDED | |||||
| #define DTB_DEFAULT_ENTRIES 64 | |||||
| #endif | |||||
| #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | ||||
| #ifdef __NetBSD__ | #ifdef __NetBSD__ | ||||
| @@ -1668,16 +1668,23 @@ void gotoblas_dummy_for_PGI(void) { | |||||
| #ifndef MEM_LARGE_PAGES | #ifndef MEM_LARGE_PAGES | ||||
| #define MEM_LARGE_PAGES 0x20000000 | #define MEM_LARGE_PAGES 0x20000000 | ||||
| #endif | #endif | ||||
| #else | |||||
| #elif !defined(OS_EMBEDDED) | |||||
| #define ALLOC_MMAP | #define ALLOC_MMAP | ||||
| #define ALLOC_MALLOC | #define ALLOC_MALLOC | ||||
| #else | |||||
| #define ALLOC_MALLOC | |||||
| inline int puts(const char *str) { return 0; } | |||||
| inline int printf(const char *format, ...) { return 0; } | |||||
| inline char *getenv(const char *name) { return ""; } | |||||
| inline int atoi(const char *str) { return 0; } | |||||
| #endif | #endif | ||||
| #include <stdlib.h> | #include <stdlib.h> | ||||
| #include <stdio.h> | #include <stdio.h> | ||||
| #include <fcntl.h> | #include <fcntl.h> | ||||
| #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||||
| #if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) | |||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||
| #ifndef NO_SYSV_IPC | #ifndef NO_SYSV_IPC | ||||
| #include <sys/shm.h> | #include <sys/shm.h> | ||||
| @@ -1634,10 +1634,10 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c | |||||
| cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| cblas_crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c | |||||
| cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c | |||||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | ||||
| cblas_zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c | |||||
| cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c | |||||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | ||||
| cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | ||||
| @@ -0,0 +1,176 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||||
| { | |||||
| __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; | |||||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||||
| __asm__ | |||||
| ( | |||||
| "dcbt 0, %2 \n\t" | |||||
| "xscvdpspn 32, %x3 \n\t" | |||||
| "xxspltw 32, 32, 0 \n\t" | |||||
| "lxvp 40, 0(%2) \n\t" | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||||
| "xvmulsp 49, 41, 32 \n\t" | |||||
| "xvmulsp 50, 42, 32 \n\t" | |||||
| "xvmulsp 51, 43, 32 \n\t" | |||||
| "xvmulsp 52, 44, 32 \n\t" | |||||
| "xvmulsp 53, 45, 32 \n\t" | |||||
| "xvmulsp 54, 46, 32 \n\t" | |||||
| "xvmulsp 55, 47, 32 \n\t" | |||||
| "xxperm 34, 40, %x5 \n\t" | |||||
| "xxperm 35, 41, %x5 \n\t" | |||||
| "xxperm 36, 42, %x5 \n\t" | |||||
| "xxperm 37, 43, %x5 \n\t" | |||||
| "xxperm 38, 44, %x5 \n\t" | |||||
| "xxperm 39, 45, %x5 \n\t" | |||||
| "xxperm 56, 46, %x5 \n\t" | |||||
| "xxperm 57, 47, %x5 \n\t" | |||||
| "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||||
| "xvmulsp 35, 35, %x4 \n\t" | |||||
| "lxvp 40, 128(%2) \n\t" | |||||
| "xvmulsp 36, 36, %x4 \n\t" | |||||
| "xvmulsp 37, 37, %x4 \n\t" | |||||
| "lxvp 42, 160(%2) \n\t" | |||||
| "xvmulsp 38, 38, %x4 \n\t" | |||||
| "xvmulsp 39, 39, %x4 \n\t" | |||||
| "lxvp 44, 192(%2) \n\t" | |||||
| "xvmulsp 56, 56, %x4 \n\t" | |||||
| "xvmulsp 57, 57, %x4 \n\t" | |||||
| "lxvp 46, 224(%2) \n\t" | |||||
| "xvaddsp 48, 48, 34 \n\t" | |||||
| "xvaddsp 49, 49, 35 \n\t" | |||||
| "xvaddsp 50, 50, 36 \n\t" | |||||
| "xvaddsp 51, 51, 37 \n\t" | |||||
| "stxvp 48, 0(%2) \n\t" | |||||
| "xvaddsp 52, 52, 38 \n\t" | |||||
| "xvaddsp 53, 53, 39 \n\t" | |||||
| "stxvp 50, 32(%2) \n\t" | |||||
| "xvaddsp 54, 54, 56 \n\t" | |||||
| "xvaddsp 55, 55, 57 \n\t" | |||||
| "stxvp 52, 64(%2) \n\t" | |||||
| "stxvp 54, 96(%2) \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "bgt one%= \n" | |||||
| "two%=: \n\t" | |||||
| "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||||
| "xvmulsp 49, 41, 32 \n\t" | |||||
| "xvmulsp 50, 42, 32 \n\t" | |||||
| "xvmulsp 51, 43, 32 \n\t" | |||||
| "xvmulsp 52, 44, 32 \n\t" | |||||
| "xvmulsp 53, 45, 32 \n\t" | |||||
| "xvmulsp 54, 46, 32 \n\t" | |||||
| "xvmulsp 55, 47, 32 \n\t" | |||||
| "xxperm 34, 40, %x5 \n\t" | |||||
| "xxperm 35, 41, %x5 \n\t" | |||||
| "xxperm 36, 42, %x5 \n\t" | |||||
| "xxperm 37, 43, %x5 \n\t" | |||||
| "xxperm 38, 44, %x5 \n\t" | |||||
| "xxperm 39, 45, %x5 \n\t" | |||||
| "xxperm 56, 46, %x5 \n\t" | |||||
| "xxperm 57, 47, %x5 \n\t" | |||||
| "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||||
| "xvmulsp 35, 35, %x4 \n\t" | |||||
| "xvmulsp 36, 36, %x4 \n\t" | |||||
| "xvmulsp 37, 37, %x4 \n\t" | |||||
| "xvmulsp 38, 38, %x4 \n\t" | |||||
| "xvmulsp 39, 39, %x4 \n\t" | |||||
| "xvmulsp 56, 56, %x4 \n\t" | |||||
| "xvmulsp 57, 57, %x4 \n\t" | |||||
| "xvaddsp 48, 48, 34 \n\t" | |||||
| "xvaddsp 49, 49, 35 \n\t" | |||||
| "xvaddsp 50, 50, 36 \n\t" | |||||
| "xvaddsp 51, 51, 37 \n\t" | |||||
| "stxvp 48, 0(%2) \n\t" | |||||
| "xvaddsp 52, 52, 38 \n\t" | |||||
| "xvaddsp 53, 53, 39 \n\t" | |||||
| "stxvp 50, 32(%2) \n\t" | |||||
| "xvaddsp 54, 54, 56 \n\t" | |||||
| "xvaddsp 55, 55, 57 \n\t" | |||||
| "stxvp 52, 64(%2) \n\t" | |||||
| "stxvp 54, 96(%2) \n\t" | |||||
| "#n=%1 x=%0=%2 alpha=(%3,%4)\n" | |||||
| : | |||||
| "+m" (*x), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x) // 2 | |||||
| : | |||||
| "f" (alpha_r), // 3 | |||||
| "wa" (t0), // 4 | |||||
| "wa" (mask) // 5 | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57" | |||||
| ); | |||||
| } | |||||
| @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||||
| #if defined(__VEC__) || defined(__ALTIVEC__) | #if defined(__VEC__) || defined(__ALTIVEC__) | ||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #include "zscal_microk_power8.c" | #include "zscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| #elif defined(POWER10) | |||||
| #if defined(DOUBLE) | |||||
| #include "zscal_microk_power8.c" | |||||
| #else | |||||
| #include "cscal_microk_power10.c" | |||||
| #endif | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||||
| { | { | ||||
| #if defined(DOUBLE) | |||||
| n1 = n & -8; | n1 = n & -8; | ||||
| #else | |||||
| n1 = n & -16; | |||||
| #endif | |||||
| if ( n1 > 0 ) | if ( n1 > 0 ) | ||||
| { | { | ||||
| zscal_kernel_8(n1, x, da_r, da_i); | zscal_kernel_8(n1, x, da_r, da_i); | ||||
| @@ -320,12 +320,13 @@ | |||||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | ||||
| COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | ||||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | ||||
| $ U12, X | |||||
| $ U12, X, ABI12, Y | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| COMPLEX CLADIV | |||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| REAL CLANHS, SLAMCH | REAL CLANHS, SLAMCH | ||||
| EXTERNAL LSAME, CLANHS, SLAMCH | |||||
| EXTERNAL CLADIV, LLSAME, CLANHS, SLAMCH | |||||
| * .. | * .. | ||||
| * .. External Subroutines .. | * .. External Subroutines .. | ||||
| EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA | ||||
| @@ -729,15 +730,21 @@ | |||||
| AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | ||||
| $ ( BSCALE*T( ILAST, ILAST ) ) | $ ( BSCALE*T( ILAST, ILAST ) ) | ||||
| ABI22 = AD22 - U12*AD21 | ABI22 = AD22 - U12*AD21 | ||||
| ABI12 = AD12 - U12*AD11 | |||||
| * | * | ||||
| T1 = HALF*( AD11+ABI22 ) | |||||
| RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) | |||||
| TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + | |||||
| $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) | |||||
| IF( TEMP.LE.ZERO ) THEN | |||||
| SHIFT = T1 + RTDISC | |||||
| ELSE | |||||
| SHIFT = T1 - RTDISC | |||||
| SHIFT = ABI22 | |||||
| CTEMP = SQRT( ABI12 )*SQRT( AD21 ) | |||||
| TEMP = ABS1( CTEMP ) | |||||
| IF( CTEMP.NE.ZERO ) THEN | |||||
| X = HALF*( AD11-SHIFT ) | |||||
| TEMP2 = ABS1( X ) | |||||
| TEMP = MAX( TEMP, ABS1( X ) ) | |||||
| Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) | |||||
| IF( TEMP2.GT.ZERO ) THEN | |||||
| IF( REAL( X / TEMP2 )*REAL( Y )+ | |||||
| $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y | |||||
| END IF | |||||
| SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) | |||||
| END IF | END IF | ||||
| ELSE | ELSE | ||||
| * | * | ||||
| @@ -320,12 +320,13 @@ | |||||
| $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP | ||||
| COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, | ||||
| $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, | ||||
| $ U12, X | |||||
| $ U12, X, ABI12, Y | |||||
| * .. | * .. | ||||
| * .. External Functions .. | * .. External Functions .. | ||||
| COMPLEX*16 ZLADIV | |||||
| LOGICAL LSAME | LOGICAL LSAME | ||||
| DOUBLE PRECISION DLAMCH, ZLANHS | DOUBLE PRECISION DLAMCH, ZLANHS | ||||
| EXTERNAL LSAME, DLAMCH, ZLANHS | |||||
| EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS | |||||
| * .. | * .. | ||||
| * .. External Subroutines .. | * .. External Subroutines .. | ||||
| EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL | EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL | ||||
| @@ -730,15 +731,21 @@ | |||||
| AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | AD22 = ( ASCALE*H( ILAST, ILAST ) ) / | ||||
| $ ( BSCALE*T( ILAST, ILAST ) ) | $ ( BSCALE*T( ILAST, ILAST ) ) | ||||
| ABI22 = AD22 - U12*AD21 | ABI22 = AD22 - U12*AD21 | ||||
| ABI12 = AD12 - U12*AD11 | |||||
| * | * | ||||
| T1 = HALF*( AD11+ABI22 ) | |||||
| RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) | |||||
| TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + | |||||
| $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) | |||||
| IF( TEMP.LE.ZERO ) THEN | |||||
| SHIFT = T1 + RTDISC | |||||
| ELSE | |||||
| SHIFT = T1 - RTDISC | |||||
| SHIFT = ABI22 | |||||
| CTEMP = SQRT( ABI12 )*SQRT( AD21 ) | |||||
| TEMP = ABS1( CTEMP ) | |||||
| IF( CTEMP.NE.ZERO ) THEN | |||||
| X = HALF*( AD11-SHIFT ) | |||||
| TEMP2 = ABS1( X ) | |||||
| TEMP = MAX( TEMP, ABS1( X ) ) | |||||
| Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) | |||||
| IF( TEMP2.GT.ZERO ) THEN | |||||
| IF( DBLE( X / TEMP2 )*DBLE( Y )+ | |||||
| $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y | |||||
| END IF | |||||
| SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) | |||||
| END IF | END IF | ||||
| ELSE | ELSE | ||||
| * | * | ||||