Misc. typo fixes in comments and documentation
tags/v0.3.7
@@ -181,17 +181,17 @@ NO_AFFINITY = 1 | |||||
# time out to improve performance. This number should be from 4 to 30 | # time out to improve performance. This number should be from 4 to 30 | ||||
# which corresponds to (1 << n) cycles. For example, if you set to 26, | # which corresponds to (1 << n) cycles. For example, if you set to 26, | ||||
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz | # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz | ||||
# system). Also you can control this mumber by THREAD_TIMEOUT | |||||
# system). Also you can control this number by THREAD_TIMEOUT | |||||
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26 | # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 | ||||
# Using special device driver for mapping physically contigous memory | |||||
# Using special device driver for mapping physically contiguous memory | |||||
# to the user space. If bigphysarea is enabled, it will use it. | # to the user space. If bigphysarea is enabled, it will use it. | ||||
# DEVICEDRIVER_ALLOCATION = 1 | # DEVICEDRIVER_ALLOCATION = 1 | ||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only). | # If you need to synchronize FP CSR between threads (for x86/x86_64 only). | ||||
# CONSISTENT_FPCSR = 1 | # CONSISTENT_FPCSR = 1 | ||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute | |||||
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | |||||
# with single thread. (Actually in recent versions this is a factor proportional to the | # with single thread. (Actually in recent versions this is a factor proportional to the | ||||
# number of floating point operations necessary for the given problem size, no longer | # number of floating point operations necessary for the given problem size, no longer | ||||
# an individual dimension). You can use this setting to avoid the overhead of multi- | # an individual dimension). You can use this setting to avoid the overhead of multi- | ||||
@@ -133,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt`. | |||||
#### PPC/PPC64 | #### PPC/PPC64 | ||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` | |||||
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` | |||||
#### IBM zEnterprise System | #### IBM zEnterprise System | ||||
@@ -1,7 +1,7 @@ | |||||
# helper functions for the kernel CMakeLists.txt | # helper functions for the kernel CMakeLists.txt | ||||
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. | |||||
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. | |||||
macro(SetDefaultL1) | macro(SetDefaultL1) | ||||
set(SAMAXKERNEL amax.S) | set(SAMAXKERNEL amax.S) | ||||
set(DAMAXKERNEL amax.S) | set(DAMAXKERNEL amax.S) | ||||
@@ -283,7 +283,7 @@ endif () | |||||
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | ||||
# TODO: nead to convert these Makefiles | |||||
# TODO: need to convert these Makefiles | |||||
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | ||||
if (${CORE} STREQUAL "PPC440") | if (${CORE} STREQUAL "PPC440") | ||||
@@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) | |||||
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) | set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) | ||||
endfunction () | endfunction () | ||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition | |||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition | |||||
# @param sources_in the source files to build from | # @param sources_in the source files to build from | ||||
# @param defines_in (optional) preprocessor definitions that will be applied to all objects | # @param defines_in (optional) preprocessor definitions that will be applied to all objects | ||||
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. | # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. | ||||
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
* SIZE must be carefully chosen to be: | * SIZE must be carefully chosen to be: | ||||
* - as small as possible to maximize the number of stack allocation | * - as small as possible to maximize the number of stack allocation | ||||
* - large enough to support all architectures and kernel | * - large enough to support all architectures and kernel | ||||
* Chosing a too small SIZE will lead to a stack smashing. | |||||
* Choosing a SIZE too small will lead to stack smashing. | |||||
*/ | */ | ||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | ||||
/* make it volatile because some function (ex: dgemv_n.S) */ \ | /* make it volatile because some function (ex: dgemv_n.S) */ \ | ||||
@@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
#endif | #endif | ||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
//Enable some optimazation for barcelona. | |||||
//Enable some optimization for barcelona. | |||||
#define BARCELONA_OPTIMIZATION | #define BARCELONA_OPTIMIZATION | ||||
#endif | #endif | ||||
@@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
#ifdef ASSEMBLER | #ifdef ASSEMBLER | ||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
//Enable some optimazation for barcelona. | |||||
//Enable some optimization for barcelona. | |||||
#define BARCELONA_OPTIMIZATION | #define BARCELONA_OPTIMIZATION | ||||
#endif | #endif | ||||
@@ -577,7 +577,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -653,7 +653,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -653,7 +653,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -577,7 +577,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); | |||||
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ | /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ | ||||
/* jobs is queued. */ | /* jobs is queued. */ | ||||
/* We need this grobal for cheking if initialization is finished. */ | |||||
/* We need this global for checking if initialization is finished. */ | |||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | ||||
/* Local Variables */ | /* Local Variables */ | ||||
@@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); | |||||
#ifdef MONITOR | #ifdef MONITOR | ||||
/* Monitor is a function to see thread's status for every seconds. */ | |||||
/* Usually it turns off and it's for debugging. */ | |||||
/* Monitor is a function to see each thread's status every second. */ | |||||
/* It is usually turned off; it is meant for debugging. */ | |||||
static pthread_t monitor_thread; | static pthread_t monitor_thread; | ||||
static int main_status[MAX_CPU_NUMBER]; | static int main_status[MAX_CPU_NUMBER]; | ||||
@@ -50,7 +50,7 @@ | |||||
/* This is a thread implementation for Win32 lazy implementation */ | /* This is a thread implementation for Win32 lazy implementation */ | ||||
/* Thread server common infomation */ | |||||
/* Thread server common information */ | |||||
typedef struct{ | typedef struct{ | ||||
CRITICAL_SECTION lock; | CRITICAL_SECTION lock; | ||||
HANDLE filled; | HANDLE filled; | ||||
@@ -61,7 +61,7 @@ typedef struct{ | |||||
} blas_pool_t; | } blas_pool_t; | ||||
/* We need this global for cheking if initialization is finished. */ | |||||
/* We need this global for checking if initialization is finished. */ | |||||
int blas_server_avail = 0; | int blas_server_avail = 0; | ||||
/* Local Variables */ | /* Local Variables */ | ||||
@@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { | |||||
int mynode = 1; | int mynode = 1; | ||||
/* if number of threads is larger than inital condition */ | |||||
/* if number of threads is larger than initial condition */ | |||||
if (pos < 0) { | if (pos < 0) { | ||||
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); | sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); | ||||
return 0; | return 0; | ||||
@@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ | |||||
#ifdef ALLOC_DEVICEDRIVER | #ifdef ALLOC_DEVICEDRIVER | ||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | ||||
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); | |||||
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); | |||||
} | } | ||||
#endif | #endif | ||||
@@ -125,7 +125,7 @@ if ($compiler eq "") { | |||||
$openmp = "-openmp"; | $openmp = "-openmp"; | ||||
} | } | ||||
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores. | |||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | |||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | ||||
if ($data =~ / zho_ge__/) { | if ($data =~ / zho_ge__/) { | ||||
$need2bu = 1; | $need2bu = 1; | ||||
@@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES | |||||
axpby.c | axpby.c | ||||
) | ) | ||||
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f | |||||
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f | |||||
# these all have 'z' sources for complex versions | # these all have 'z' sources for complex versions | ||||
set(BLAS2_SOURCES | set(BLAS2_SOURCES | ||||
gemv.c ger.c | gemv.c ger.c | ||||
@@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||||
//disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
//In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
// | // | ||||
//Temporarily work-around the low performance issue with small imput size & | |||||
//Temporarily work-around the low performance issue with small input size & | |||||
//multithreads. | //multithreads. | ||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | ||||
nthreads = 1; | nthreads = 1; | ||||
@@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||||
//disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
//In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
// | // | ||||
//Temporarily work-around the low performance issue with small imput size & | |||||
//Temporarily work-around the low performance issue with small input size & | |||||
//multithreads. | //multithreads. | ||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | ||||
nthreads = 1; | nthreads = 1; | ||||
@@ -576,7 +576,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -991,7 +991,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -946,7 +946,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||
@@ -576,7 +576,7 @@ | |||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
* ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
* | * | ||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
* | * | ||||