@@ -1,8 +1,13 @@ | |||||
*.obj | |||||
*.lib | |||||
*.dll | |||||
*.def | |||||
*.o | *.o | ||||
lapack-3.1.1 | lapack-3.1.1 | ||||
lapack-3.1.1.tgz | lapack-3.1.1.tgz | ||||
*.so | *.so | ||||
*.a | *.a | ||||
.svn | |||||
*~ | *~ | ||||
config.h | config.h | ||||
Makefile.conf | Makefile.conf | ||||
@@ -1,13 +1,40 @@ | |||||
OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
==================================================================== | ==================================================================== | ||||
Version 0.1 alpha2(in development) | |||||
Version 0.1 alpha2 | |||||
23-Jun-2011 | |||||
common: | common: | ||||
* | |||||
* Fixed blasint undefined bug in <cblas.h> file. Other software | |||||
could include this header successfully(Refs issue #13 on github) | |||||
* Fixed the SEGFAULT bug on 64 cores. On SMP server, the number | |||||
of CPUs or cores should be less than or equal to 64.(Refs issue #14 | |||||
on github) | |||||
* Support "void goto_set_num_threads(int num_threads)" and "void | |||||
openblas_set_num_threads(int num_threads)" when USE_OPENMP=1 | |||||
* Added extern "C" to support C++. Thanks to Tasio for the patch (Refs | |||||
issue #21 on github) | |||||
* Provided an error message when the arch is not supported.(Refs | |||||
issue #19 on github) | |||||
* Fixed issue #23. Fixed a bug of f_check script about generating link flags. | |||||
* Added openblas_set_num_threads for Fortran. | |||||
* Fixed #25 a wrong result of rotmg. | |||||
* Fixed a bug about detecting underscore prefix in c_check. | |||||
* Print the wall time (cycles) with enabling FUNCTION_PROFILE | |||||
* Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1 | |||||
* Added install target. You can use "make install". (Refs #20) | |||||
x86/x86_64: | x86/x86_64: | ||||
* | |||||
* Fixed #28 a wrong result of dsdot on x86_64. | |||||
* Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. | |||||
* Fixed #33 ztrmm bug on Nehalem. | |||||
* Worked around #27, the low-performance axpy issue with small input sizes and multiple threads. | |||||
MIPS64: | MIPS64: | ||||
* | |||||
* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. | |||||
* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2) | |||||
* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3) | |||||
==================================================================== | ==================================================================== | ||||
Version 0.1 alpha1 | Version 0.1 alpha1 | ||||
20-Mar-2011 | 20-Mar-2011 | ||||
@@ -15,6 +15,10 @@ ifdef SANITY_CHECK | |||||
BLASDIRS += reference | BLASDIRS += reference | ||||
endif | endif | ||||
ifndef PREFIX | |||||
PREFIX = /opt/OpenBLAS | |||||
endif | |||||
SUBDIRS = $(BLASDIRS) | SUBDIRS = $(BLASDIRS) | ||||
ifneq ($(NO_LAPACK), 1) | ifneq ($(NO_LAPACK), 1) | ||||
SUBDIRS += lapack | SUBDIRS += lapack | ||||
@@ -22,8 +26,8 @@ endif | |||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | ||||
.PHONY : all libs netlib test ctest shared | |||||
.NOTPARALLEL : all libs prof lapack-test | |||||
.PHONY : all libs netlib test ctest shared install | |||||
.NOTPARALLEL : all libs prof lapack-test install | |||||
all :: libs netlib tests shared | all :: libs netlib tests shared | ||||
@echo | @echo | ||||
@@ -70,7 +74,7 @@ ifeq ($(OSNAME), Darwin) | |||||
endif | endif | ||||
ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
$(MAKE) -C exports dll | $(MAKE) -C exports dll | ||||
# -ln -fs $(LIBDLLNAME) libopenblas.dll | |||||
-ln -fs $(LIBDLLNAME) libopenblas.dll | |||||
endif | endif | ||||
ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
$(MAKE) -C exports dll | $(MAKE) -C exports dll | ||||
@@ -96,18 +100,26 @@ endif | |||||
endif | endif | ||||
libs : | libs : | ||||
ifeq ($(CORE), UNKOWN) | |||||
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) | |||||
endif | |||||
-ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) | -ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX) | ||||
for d in $(SUBDIRS) ; \ | for d in $(SUBDIRS) ; \ | ||||
do if test -d $$d; then \ | do if test -d $$d; then \ | ||||
$(MAKE) -C $$d $(@F) || exit 1 ; \ | $(MAKE) -C $$d $(@F) || exit 1 ; \ | ||||
fi; \ | fi; \ | ||||
done | done | ||||
#Save the config files for installation | |||||
cp Makefile.conf Makefile.conf_last | |||||
cp config.h config_last.h | |||||
ifdef DYNAMIC_ARCH | ifdef DYNAMIC_ARCH | ||||
$(MAKE) -C kernel commonlibs || exit 1 | $(MAKE) -C kernel commonlibs || exit 1 | ||||
for d in $(DYNAMIC_CORE) ; \ | for d in $(DYNAMIC_CORE) ; \ | ||||
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ | ||||
done | done | ||||
echo DYNAMIC_ARCH=1 >> Makefile.conf_last | |||||
endif | endif | ||||
touch lib.grd | |||||
prof : prof_blas prof_lapack | prof : prof_blas prof_lapack | ||||
@@ -227,19 +239,23 @@ lapack-test : | |||||
dummy : | dummy : | ||||
install : | |||||
$(MAKE) -f Makefile.install install | |||||
clean :: | clean :: | ||||
@for d in $(SUBDIRS_ALL) ; \ | @for d in $(SUBDIRS_ALL) ; \ | ||||
do if test -d $$d; then \ | do if test -d $$d; then \ | ||||
$(MAKE) -C $$d $(@F) || exit 1 ; \ | $(MAKE) -C $$d $(@F) || exit 1 ; \ | ||||
fi; \ | fi; \ | ||||
done | done | ||||
ifdef DYNAMIC_ARCH | |||||
#ifdef DYNAMIC_ARCH | |||||
@$(MAKE) -C kernel clean | @$(MAKE) -C kernel clean | ||||
endif | |||||
#endif | |||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h | @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h | ||||
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | ||||
@if test -d lapack-3.1.1; then \ | @if test -d lapack-3.1.1; then \ | ||||
echo deleting lapack-3.1.1; \ | echo deleting lapack-3.1.1; \ | ||||
rm -rf lapack-3.1.1 ;\ | rm -rf lapack-3.1.1 ;\ | ||||
fi | fi | ||||
@rm -f *.grd Makefile.conf_last config_last.h | |||||
@echo Done. | @echo Done. |
@@ -0,0 +1,65 @@ | |||||
TOPDIR = . | |||||
export GOTOBLAS_MAKEFILE = 1 | |||||
-include $(TOPDIR)/Makefile.conf_last | |||||
include ./Makefile.system | |||||
.PHONY : install | |||||
.NOTPARALLEL : install | |||||
# lib.grd is a marker file (the main Makefile's "libs" target touches it after
# a build). This rule has no prerequisites, so it only fires when the marker is
# missing; in that case it aborts with an error telling the user to build first.
lib.grd : | |||||
$(error OpenBLAS: Please run "make" firstly) | |||||
# install: copy the built library and generated headers into $(PREFIX).
#
# Requires lib.grd (i.e. a completed build). Steps, in order:
#   1. Create $(PREFIX) (errors from mkdir are ignored).
#   2. Generate openblas_config.h from config_last.h, a VERSION define and
#      openblas_config_template.h, wrapped in an include guard.
#   3. Generate f77blas.h from common_interface.h, wrapped in an include guard
#      and including openblas_config.h.
#   4. Generate cblas.h by replacing every "common" with "openblas_config".
#   5. Copy the static library and symlink it as libopenblas.$(LIBSUFFIX).
#   6. Copy the shared library for the detected $(OSNAME) and symlink it under
#      its generic name.
# Everything is placed directly in $(PREFIX); no sub-directories are created.
install : lib.grd | |||||
@-mkdir -p $(PREFIX) | |||||
@echo Generating openblas_config.h in $(PREFIX) | |||||
#for inc | |||||
@echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h | |||||
@echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h | |||||
@cat config_last.h >> $(PREFIX)/openblas_config.h | |||||
@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h | |||||
@cat openblas_config_template.h >> $(PREFIX)/openblas_config.h | |||||
@echo \#endif >> $(PREFIX)/openblas_config.h | |||||
@echo Generating f77blas.h in $(PREFIX) | |||||
@echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h | |||||
@echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h | |||||
@echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h | |||||
@cat common_interface.h >> $(PREFIX)/f77blas.h | |||||
@echo \#endif >> $(PREFIX)/f77blas.h | |||||
@echo Generating cblas.h in $(PREFIX) | |||||
@sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h | |||||
#for install static library | |||||
@echo Copy the static library to $(PREFIX) | |||||
@cp $(LIBNAME) $(PREFIX) | |||||
@-ln -fs $(PREFIX)/$(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX) | |||||
#for install shared library | |||||
# The leading '-' on the cp/ln lines below makes make ignore their failures,
# so a missing shared library does not abort the install.
@echo Copy the shared library to $(PREFIX) | |||||
ifeq ($(OSNAME), Linux) | |||||
-cp $(LIBSONAME) $(PREFIX) | |||||
-ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so | |||||
endif | |||||
ifeq ($(OSNAME), FreeBSD) | |||||
-cp $(LIBSONAME) $(PREFIX) | |||||
-ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so | |||||
endif | |||||
ifeq ($(OSNAME), NetBSD) | |||||
-cp $(LIBSONAME) $(PREFIX) | |||||
-ln -fs $(PREFIX)/$(LIBSONAME) $(PREFIX)/libopenblas.so | |||||
endif | |||||
ifeq ($(OSNAME), Darwin) | |||||
-cp $(LIBDYNNAME) $(PREFIX) | |||||
-ln -fs $(PREFIX)/$(LIBDYNNAME) $(PREFIX)/libopenblas.dylib | |||||
endif | |||||
ifeq ($(OSNAME), WINNT) | |||||
-cp $(LIBDLLNAME) $(PREFIX) | |||||
-ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll | |||||
endif | |||||
ifeq ($(OSNAME), CYGWIN_NT) | |||||
-cp $(LIBDLLNAME) $(PREFIX) | |||||
-ln -fs $(PREFIX)/$(LIBDLLNAME) $(PREFIX)/libopenblas.dll | |||||
endif | |||||
@echo Install OK! | |||||
@@ -91,6 +91,9 @@ VERSION = 0.1alpha2 | |||||
# SANITY_CHECK to compare the result with reference BLAS. | # SANITY_CHECK to compare the result with reference BLAS. | ||||
# UTEST_CHECK = 1 | # UTEST_CHECK = 1 | ||||
# The installation directory. | |||||
# PREFIX = /opt/OpenBLAS | |||||
# Common Optimization Flag; -O2 is enough. | # Common Optimization Flag; -O2 is enough. | ||||
# DEBUG = 1 | # DEBUG = 1 | ||||
@@ -30,6 +30,10 @@ ifdef TARGET | |||||
GETARCH_FLAGS += -DFORCE_$(TARGET) | GETARCH_FLAGS += -DFORCE_$(TARGET) | ||||
endif | endif | ||||
ifdef INTERFACE64 | |||||
GETARCH_FLAGS += -DUSE64BITINT | |||||
endif | |||||
# This operation is expensive, so execution should be once. | # This operation is expensive, so execution should be once. | ||||
ifndef GOTOBLAS_MAKEFILE | ifndef GOTOBLAS_MAKEFILE | ||||
export GOTOBLAS_MAKEFILE = 1 | export GOTOBLAS_MAKEFILE = 1 | ||||
@@ -185,7 +189,7 @@ ifeq ($(C_COMPILER), INTEL) | |||||
CCOMMON_OPT += -wd981 | CCOMMON_OPT += -wd981 | ||||
endif | endif | ||||
ifdef USE_OPENMP | |||||
ifeq ($(USE_OPENMP), 1) | |||||
ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
CCOMMON_OPT += -fopenmp | CCOMMON_OPT += -fopenmp | ||||
endif | endif | ||||
@@ -489,7 +493,8 @@ endif | |||||
ifdef BINARY64 | ifdef BINARY64 | ||||
ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
CCOMMON_OPT += -DUSE64BITINT | |||||
CCOMMON_OPT += | |||||
#-DUSE64BITINT | |||||
endif | endif | ||||
endif | endif | ||||
@@ -510,6 +515,10 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||||
CCOMMON_OPT += -DDYNAMIC_ARCH | CCOMMON_OPT += -DDYNAMIC_ARCH | ||||
endif | endif | ||||
ifeq ($(NO_LAPACK), 1) | |||||
CCOMMON_OPT += -DNO_LAPACK | |||||
endif | |||||
ifdef SMP | ifdef SMP | ||||
CCOMMON_OPT += -DSMP_SERVER | CCOMMON_OPT += -DSMP_SERVER | ||||
@@ -8,7 +8,9 @@ Download from project homepage. http://xianyi.github.com/OpenBLAS/ | |||||
Or, | Or, | ||||
check out codes from git://github.com/xianyi/OpenBLAS.git | check out codes from git://github.com/xianyi/OpenBLAS.git | ||||
1)Normal compile | 1)Normal compile | ||||
Please read GotoBLAS_02QuickInstall.txt or type "make" | |||||
(a) type "make" to detect the CPU automatically. | |||||
or | |||||
(b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. | |||||
2)Cross compile | 2)Cross compile | ||||
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. | ||||
@@ -20,6 +22,11 @@ make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-g | |||||
3)Debug version | 3)Debug version | ||||
make DEBUG=1 | make DEBUG=1 | ||||
4)Install to the directory (Optional) | |||||
e.g. | |||||
make install PREFIX=your_installation_directory | |||||
The default directory is /opt/OpenBLAS | |||||
3.Support CPU & OS | 3.Support CPU & OS | ||||
Please read GotoBLAS_01Readme.txt | Please read GotoBLAS_01Readme.txt | ||||
@@ -39,13 +46,17 @@ export GOTO_NUM_THREADS=4 | |||||
or | or | ||||
export OMP_NUM_THREADS=4 | export OMP_NUM_THREADS=4 | ||||
The priorities are OPENBLAS_NUM_THREAD > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||||
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. | |||||
If you compile this lib with USE_OPENMP=1, you should only set OMP_NUM_THREADS environment variable. | |||||
4.2 Set the number of threads with calling functions. for example, | 4.2 Set the number of threads with calling functions. for example, | ||||
void goto_set_num_threads(int num_threads); | void goto_set_num_threads(int num_threads); | ||||
or | or | ||||
void openblas_set_num_threads(int num_threads); | void openblas_set_num_threads(int num_threads); | ||||
If you compile this lib with USE_OPENMP=1, you should use the above functions, too. | |||||
5.Report Bugs | 5.Report Bugs | ||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues | Please add a issue in https://github.com/xianyi/OpenBLAS/issues | ||||
@@ -56,4 +67,17 @@ Optimization on ICT Loongson 3A CPU | |||||
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas | ||||
8.ChangeLog | 8.ChangeLog | ||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. | |||||
Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version. | |||||
9.Known Issues | |||||
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit | |||||
is 64. On 32 bits, it is 32. | |||||
* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) could compile the codes successfully. | |||||
10. Specification of Git Branches | |||||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). | |||||
Now, there are 4 branches in github.com. | |||||
* The master branch. This is a main branch to reflect a production-ready state. | |||||
* The develop branch. This is a main branch to reflect a state with the latest delivered development changes for the next release. | |||||
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future. | |||||
* The gh-pages branch. This is for web pages |
@@ -0,0 +1,57 @@ | |||||
Force Target Examples: | |||||
make TARGET=NEHALEM | |||||
make TARGET=LOONGSON3A BINARY=64 | |||||
make TARGET=ISTANBUL | |||||
Supported List: | |||||
1.X86/X86_64 | |||||
a)Intel CPU: | |||||
P2 | |||||
COPPERMINE | |||||
KATMAI | |||||
NORTHWOOD | |||||
PRESCOTT | |||||
BANIAS | |||||
YONAH | |||||
CORE2 | |||||
PENRYN | |||||
DUNNINGTON | |||||
NEHALEM | |||||
ATOM | |||||
b)AMD CPU: | |||||
ATHLON | |||||
OPTERON | |||||
OPTERON_SSE3 | |||||
BARCELONA | |||||
SHANGHAI | |||||
ISTANBUL | |||||
c)VIA CPU: | |||||
SSE_GENERIC | |||||
VIAC3 | |||||
NANO | |||||
2.Power CPU: | |||||
POWER4 | |||||
POWER5 | |||||
POWER6 | |||||
PPCG4 | |||||
PPC970 | |||||
PPC970MP | |||||
PPC440 | |||||
PPC440FP2 | |||||
CELL | |||||
3.MIPS64 CPU: | |||||
SICORTEX | |||||
LOONGSON3A | |||||
4.IA64 CPU: | |||||
ITANIUM2 | |||||
5.SPARC CPU: | |||||
SPARC | |||||
SPARCV7 | |||||
@@ -149,7 +149,7 @@ $binformat = bin64 if ($data =~ /BINARY_64/); | |||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | ||||
$data =~ /globl\ ([_\.]*)(.*)/; | |||||
$data =~ /globl\s([_\.]*)(.*)/; | |||||
$need_fu = $1; | $need_fu = $1; | ||||
@@ -1,6 +1,14 @@ | |||||
#ifndef CBLAS_H | #ifndef CBLAS_H | ||||
#define CBLAS_H | #define CBLAS_H | ||||
#ifdef __cplusplus | |||||
extern "C" { | |||||
/* Assume C declarations for C++ */ | |||||
#endif /* __cplusplus */ | |||||
#include <stddef.h> | |||||
#include "common.h" | |||||
#define CBLAS_INDEX size_t | #define CBLAS_INDEX size_t | ||||
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | ||||
@@ -270,4 +278,10 @@ void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANS | |||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | ||||
void cblas_xerbla(blasint p, char *rout, char *form, ...); | void cblas_xerbla(blasint p, char *rout, char *form, ...); | ||||
#ifdef __cplusplus | |||||
} | |||||
#endif /* __cplusplus */ | |||||
#endif | #endif |
@@ -39,6 +39,11 @@ | |||||
#ifndef COMMON_H | #ifndef COMMON_H | ||||
#define COMMON_H | #define COMMON_H | ||||
#ifdef __cplusplus | |||||
extern "C" { | |||||
/* Assume C declarations for C++ */ | |||||
#endif /* __cplusplus */ | |||||
#ifndef _GNU_SOURCE | #ifndef _GNU_SOURCE | ||||
#define _GNU_SOURCE | #define _GNU_SOURCE | ||||
#endif | #endif | ||||
@@ -607,4 +612,9 @@ extern int gotoblas_profile; | |||||
#define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) | #define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) | ||||
#endif | #endif | ||||
#ifdef __cplusplus | |||||
} | |||||
#endif /* __cplusplus */ | |||||
#endif | #endif |
@@ -60,4 +60,8 @@ float _Complex BLASFUNC_REF(cdotc) (blasint *, float *, blasint *, float *, | |||||
double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *); | double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *); | ||||
double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *); | double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *); | ||||
void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); | |||||
double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); | |||||
#endif | #endif |
@@ -1302,24 +1302,25 @@ int get_coretype(void){ | |||||
case 13: | case 13: | ||||
return CORE_DUNNINGTON; | return CORE_DUNNINGTON; | ||||
} | } | ||||
break; | |||||
case 2: | |||||
switch (model) { | |||||
case 5: | |||||
//Intel Core (Clarkdale) / Core (Arrandale) | |||||
// Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||||
// Xeon (Clarkdale), 32nm | |||||
return CORE_NEHALEM; | |||||
case 12: | |||||
//Xeon Processor 5600 (Westmere-EP) | |||||
return CORE_NEHALEM; | |||||
} | |||||
break; | |||||
break; | |||||
case 2: | |||||
switch (model) { | |||||
case 5: | |||||
//Intel Core (Clarkdale) / Core (Arrandale) | |||||
// Pentium (Clarkdale) / Pentium Mobile (Arrandale) | |||||
// Xeon (Clarkdale), 32nm | |||||
return CORE_NEHALEM; | |||||
case 12: | |||||
//Xeon Processor 5600 (Westmere-EP) | |||||
return CORE_NEHALEM; | |||||
} | |||||
break; | |||||
} | } | ||||
break; | |||||
case 15: | case 15: | ||||
if (model <= 0x2) return CORE_NORTHWOOD; | |||||
return CORE_PRESCOTT; | |||||
if (model <= 0x2) return CORE_NORTHWOOD; | |||||
else return CORE_PRESCOTT; | |||||
} | } | ||||
} | } | ||||
@@ -6,7 +6,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) | |||||
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | ||||
ifdef SMP | ifdef SMP | ||||
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) | |||||
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | |||||
ifndef NO_AFFINITY | ifndef NO_AFFINITY | ||||
COMMONOBJS += init.$(SUFFIX) | COMMONOBJS += init.$(SUFFIX) | ||||
endif | endif | ||||
@@ -100,6 +100,9 @@ memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h | |||||
blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h | blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h | ||||
$(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | |||||
$(CC) $(CFLAGS) -c $< -o $(@F) | |||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | ||||
$(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
@@ -38,7 +38,7 @@ | |||||
#include <stdio.h> | #include <stdio.h> | ||||
#include <stdlib.h> | #include <stdlib.h> | ||||
#include <sys/mman.h> | |||||
//#include <sys/mman.h> | |||||
#include "common.h" | #include "common.h" | ||||
#ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
@@ -49,6 +49,26 @@ | |||||
int blas_server_avail = 0; | int blas_server_avail = 0; | ||||
/*
 * Set the number of threads the library uses.
 *
 * num_threads: requested thread count. Values below 1 fall back to the
 *              current blas_num_threads; values above MAX_CPU_NUMBER are
 *              clamped to MAX_CPU_NUMBER.
 *
 * Side effects: may raise blas_num_threads, always overwrites
 * blas_cpu_number, and forwards the final count to omp_set_num_threads().
 */
void goto_set_num_threads(int num_threads) { | |||||
/* Non-positive request: keep the current thread count. */
if (num_threads < 1) num_threads = blas_num_threads; | |||||
/* Never exceed the compile-time maximum. */
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||||
/* blas_num_threads only ever grows here; it is not lowered by a smaller request. */
if (num_threads > blas_num_threads) { | |||||
blas_num_threads = num_threads; | |||||
} | |||||
blas_cpu_number = num_threads; | |||||
omp_set_num_threads(blas_cpu_number); | |||||
} | |||||
/*
 * OpenBLAS-named entry point for setting the thread count; it simply
 * forwards num_threads unchanged to goto_set_num_threads().
 */
void openblas_set_num_threads(int num_threads) { | |||||
goto_set_num_threads(num_threads); | |||||
} | |||||
int blas_thread_init(void){ | int blas_thread_init(void){ | ||||
blas_get_cpu_number(); | blas_get_cpu_number(); | ||||
@@ -172,13 +172,20 @@ static inline int rcount(unsigned long number) { | |||||
return count; | return count; | ||||
} | } | ||||
/*** | |||||
Known issue: The number of CPUs/cores should be less | |||||
than or equal to 8*sizeof(unsigned long). On 64 bits, | |||||
the limit is 64. On 32 bits, it is 32. | |||||
***/ | |||||
static inline unsigned long get_cpumap(int node) { | static inline unsigned long get_cpumap(int node) { | ||||
int infile; | int infile; | ||||
unsigned long affinity; | unsigned long affinity; | ||||
char name[160]; | char name[160]; | ||||
char cpumap[160]; | |||||
char *p, *dummy; | char *p, *dummy; | ||||
int i=0; | |||||
sprintf(name, CPUMAP_NAME, node); | sprintf(name, CPUMAP_NAME, node); | ||||
infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
@@ -187,13 +194,19 @@ static inline unsigned long get_cpumap(int node) { | |||||
if (infile != -1) { | if (infile != -1) { | ||||
read(infile, name, sizeof(name)); | |||||
read(infile, cpumap, sizeof(cpumap)); | |||||
p = cpumap; | |||||
while (*p != '\n' && i<160){ | |||||
if(*p != ',') { | |||||
name[i++]=*p; | |||||
} | |||||
p++; | |||||
} | |||||
p = name; | p = name; | ||||
while ((*p == '0') || (*p == ',')) p++; | |||||
// while ((*p == '0') || (*p == ',')) p++; | |||||
affinity = strtol(p, &dummy, 16); | |||||
affinity = strtoul(p, &dummy, 16); | |||||
close(infile); | close(infile); | ||||
} | } | ||||
@@ -347,7 +360,13 @@ static void disable_hyperthread(void) { | |||||
unsigned long share; | unsigned long share; | ||||
int cpu; | int cpu; | ||||
common -> avail = (1UL << common -> num_procs) - 1; | |||||
if(common->num_procs > 64){ | |||||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); | |||||
exit(1); | |||||
}else if(common->num_procs == 64){ | |||||
common -> avail = 0xFFFFFFFFFFFFFFFFUL; | |||||
}else | |||||
common -> avail = (1UL << common -> num_procs) - 1; | |||||
#ifdef DEBUG | #ifdef DEBUG | ||||
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | ||||
@@ -376,7 +395,13 @@ static void disable_affinity(void) { | |||||
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | ||||
#endif | #endif | ||||
lprocmask = (1UL << common -> final_num_procs) - 1; | |||||
if(common->final_num_procs > 64){ | |||||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); | |||||
exit(1); | |||||
}else if(common->final_num_procs == 64){ | |||||
lprocmask = 0xFFFFFFFFFFFFFFFFUL; | |||||
}else | |||||
lprocmask = (1UL << common -> final_num_procs) - 1; | |||||
#ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | ||||
@@ -0,0 +1,45 @@ | |||||
/***************************************************************************** | |||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the ISCAS nor the names of its contributors may | |||||
be used to endorse or promote products derived from this software | |||||
without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
**********************************************************************************/ | |||||
#include "common.h" | |||||
#ifdef SMP_SERVER | |||||
#ifdef OS_LINUX | |||||
extern void openblas_set_num_threads(int num_threads) ; | |||||
/*
 * Pointer-argument wrapper around openblas_set_num_threads(): dereferences
 * num_threads and passes the value on. Only compiled when both SMP_SERVER
 * and OS_LINUX are defined.
 * NOTE(review): NAME is a macro defined outside this file; presumably it
 * expands to the Fortran-callable symbol name -- confirm against the build.
 */
void NAME(int* num_threads){ | |||||
openblas_set_num_threads(*num_threads); | |||||
} | |||||
#endif | |||||
#endif |
@@ -74,20 +74,21 @@ void gotoblas_profile_quit(void) { | |||||
if (cycles > 0) { | if (cycles > 0) { | ||||
fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); | fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); | ||||
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n"); | |||||
fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n"); | |||||
for (i = 0; i < MAX_PROF_TABLE; i ++) { | for (i = 0; i < MAX_PROF_TABLE; i ++) { | ||||
if (function_profile_table[i].calls) { | if (function_profile_table[i].calls) { | ||||
#ifndef OS_WINDOWS | #ifndef OS_WINDOWS | ||||
fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n", | |||||
fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n", | |||||
#else | #else | ||||
fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n", | |||||
fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n", | |||||
#endif | #endif | ||||
func_table[i], | func_table[i], | ||||
function_profile_table[i].calls, | function_profile_table[i].calls, | ||||
(double)function_profile_table[i].cycles / (double)cycles * 100., | (double)function_profile_table[i].cycles / (double)cycles * 100., | ||||
(double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100., | (double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100., | ||||
(double)function_profile_table[i].area / (double)function_profile_table[i].cycles | |||||
(double)function_profile_table[i].area / (double)function_profile_table[i].cycles, | |||||
function_profile_table[i].cycles | |||||
); | ); | ||||
} | } | ||||
} | } | ||||
@@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME) | |||||
zip : dll | zip : dll | ||||
zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) | zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) | ||||
dll : libgoto2.dll | |||||
dll : ../$(LIBDLLNAME) | |||||
#libgoto2.dll | |||||
dll2 : libgoto2_shared.dll | dll2 : libgoto2_shared.dll | ||||
libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) | |||||
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) | |||||
$(RANLIB) ../$(LIBNAME) | $(RANLIB) ../$(LIBNAME) | ||||
ifeq ($(BINARY32), 1) | ifeq ($(BINARY32), 1) | ||||
$(DLLWRAP) -o $(@F) --def libgoto2.def \ | |||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ | |||||
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | ||||
-lib /machine:i386 /def:libgoto2.def | -lib /machine:i386 /def:libgoto2.def | ||||
else | else | ||||
$(DLLWRAP) -o $(@F) --def libgoto2.def \ | |||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \ | |||||
--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | --entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) | ||||
-lib /machine:X64 /def:libgoto2.def | -lib /machine:X64 /def:libgoto2.def | ||||
endif | endif | ||||
@@ -84,7 +85,7 @@ libgoto_hpl.def : gensymbol | |||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) | ||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def | $(LIBDYNNAME) : ../$(LIBNAME) osx.def | ||||
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
$(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
symbol.$(SUFFIX) : symbol.S | symbol.$(SUFFIX) : symbol.S | ||||
$(CC) $(CFLAGS) -c -o $(@F) $^ | $(CC) $(CFLAGS) -c -o $(@F) $^ | ||||
@@ -274,6 +274,7 @@ if ($link ne "") { | |||||
&& ($flags !~ /kernel32/) | && ($flags !~ /kernel32/) | ||||
&& ($flags !~ /advapi32/) | && ($flags !~ /advapi32/) | ||||
&& ($flags !~ /shell32/) | && ($flags !~ /shell32/) | ||||
&& ($flags !~ /^\-l$/) | |||||
) { | ) { | ||||
$linker_l .= $flags . " "; | $linker_l .= $flags . " "; | ||||
} | } | ||||
@@ -604,30 +604,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#ifndef POWER | #ifndef POWER | ||||
#define POWER | #define POWER | ||||
#endif | #endif | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | #endif | ||||
#if defined(__i386__) || (__x86_64__) | #if defined(__i386__) || (__x86_64__) | ||||
#include "cpuid_x86.c" | #include "cpuid_x86.c" | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | #endif | ||||
#ifdef __ia64__ | #ifdef __ia64__ | ||||
#include "cpuid_ia64.c" | #include "cpuid_ia64.c" | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | #endif | ||||
#ifdef __alpha | #ifdef __alpha | ||||
#include "cpuid_alpha.c" | #include "cpuid_alpha.c" | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | #endif | ||||
#ifdef POWER | #ifdef POWER | ||||
#include "cpuid_power.c" | #include "cpuid_power.c" | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | #endif | ||||
#ifdef sparc | #ifdef sparc | ||||
#include "cpuid_sparc.c" | #include "cpuid_sparc.c" | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | #endif | ||||
#ifdef __mips__ | #ifdef __mips__ | ||||
#include "cpuid_mips.c" | #include "cpuid_mips.c" | ||||
#define OPENBLAS_SUPPORTED | |||||
#endif | |||||
#ifndef OPENBLAS_SUPPORTED | |||||
#error "This arch/CPU is not supported by OpenBLAS." | |||||
#endif | #endif | ||||
#else | #else | ||||
@@ -30,6 +30,10 @@ int main(int argc, char **argv) { | |||||
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); | printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); | ||||
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); | printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); | ||||
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); | printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); | ||||
#ifdef USE64BITINT | |||||
printf("#define USE64BITINT\n"); | |||||
#endif | |||||
} | } | ||||
return 0; | return 0; | ||||
@@ -85,7 +85,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||||
//In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
if (incx == 0 || incy == 0) | if (incx == 0 || incy == 0) | ||||
nthreads = 1; | nthreads = 1; | ||||
//Temporary workaround for the low performance issue with small input sizes & multithreads. | |||||
if (n <= 10000) | |||||
nthreads = 1; | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | #endif | ||||
@@ -49,6 +49,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ | |||||
BLASLONG n = *N; | BLASLONG n = *N; | ||||
BLASLONG incx = *INCX; | BLASLONG incx = *INCX; | ||||
BLASLONG incy = *INCY; | BLASLONG incy = *INCY; | ||||
double ret = 0.0; | |||||
PRINT_DEBUG_NAME; | PRINT_DEBUG_NAME; | ||||
@@ -61,19 +62,21 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ | |||||
if (incx < 0) x -= (n - 1) * incx; | if (incx < 0) x -= (n - 1) * incx; | ||||
if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
return DSDOT_K(n, x, incx, y, incy); | |||||
ret=DSDOT_K(n, x, incx, y, incy); | |||||
FUNCTION_PROFILE_END(1, n, n); | FUNCTION_PROFILE_END(1, n, n); | ||||
IDEBUG_END; | IDEBUG_END; | ||||
return 0; | |||||
return ret; | |||||
} | } | ||||
#else | #else | ||||
double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ | double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ | ||||
double ret = 0.0; | |||||
PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
@@ -86,13 +89,13 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ | |||||
if (incx < 0) x -= (n - 1) * incx; | if (incx < 0) x -= (n - 1) * incx; | ||||
if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
return DSDOT_K(n, x, incx, y, incy); | |||||
ret=DSDOT_K(n, x, incx, y, incy); | |||||
FUNCTION_PROFILE_END(1, n, n); | FUNCTION_PROFILE_END(1, n, n); | ||||
IDEBUG_END; | IDEBUG_END; | ||||
return 0; | |||||
return ret; | |||||
} | } | ||||
@@ -7,6 +7,12 @@ | |||||
#define GAMSQ 16777216.e0 | #define GAMSQ 16777216.e0 | ||||
#define RGAMSQ 5.9604645e-8 | #define RGAMSQ 5.9604645e-8 | ||||
#ifdef DOUBLE | |||||
#define ABS(x) fabs(x) | |||||
#else | |||||
#define ABS(x) fabsf(x) | |||||
#endif | |||||
#ifndef CBLAS | #ifndef CBLAS | ||||
void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){ | void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){ | ||||
@@ -47,7 +53,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||||
dq2 = dp2 * dy1; | dq2 = dp2 * dy1; | ||||
dq1 = dp1 * *dx1; | dq1 = dp1 * *dx1; | ||||
if (! (abs(dq1) > abs(dq2))) goto L40; | |||||
if (! (ABS(dq1) > ABS(dq2))) goto L40; | |||||
dh21 = -(dy1) / *dx1; | dh21 = -(dy1) / *dx1; | ||||
dh12 = dp2 / dp1; | dh12 = dp2 / dp1; | ||||
@@ -140,7 +146,7 @@ L150: | |||||
goto L130; | goto L130; | ||||
L160: | L160: | ||||
if (! (abs(*dd2) <= RGAMSQ)) { | |||||
if (! (ABS(*dd2) <= RGAMSQ)) { | |||||
goto L190; | goto L190; | ||||
} | } | ||||
if (*dd2 == ZERO) { | if (*dd2 == ZERO) { | ||||
@@ -157,7 +163,7 @@ L180: | |||||
goto L160; | goto L160; | ||||
L190: | L190: | ||||
if (! (abs(*dd2) >= GAMSQ)) { | |||||
if (! (ABS(*dd2) >= GAMSQ)) { | |||||
goto L220; | goto L220; | ||||
} | } | ||||
igo = 3; | igo = 3; | ||||
@@ -53,6 +53,11 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) | |||||
CCOMMON_OPT += -DTS=$(TSUFFIX) | CCOMMON_OPT += -DTS=$(TSUFFIX) | ||||
endif | endif | ||||
KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h | |||||
ifneq ($(NO_LAPACK), 1) | |||||
KERNEL_INTERFACE += ../common_lapack.h | |||||
endif | |||||
ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
COMMONOBJS += cpuid.$(SUFFIX) | COMMONOBJS += cpuid.$(SUFFIX) | ||||
endif | endif | ||||
@@ -88,9 +93,10 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h | |||||
setparam$(TSUFFIX).c : setparam-ref.c | setparam$(TSUFFIX).c : setparam-ref.c | ||||
sed 's/TS/$(TSUFFIX)/g' $< > $(@F) | sed 's/TS/$(TSUFFIX)/g' $< > $(@F) | ||||
kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h | |||||
kernel$(TSUFFIX).h : $(KERNEL_INTERFACE) | |||||
sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) | sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) | ||||
cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S | cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S | ||||
$(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
@@ -112,10 +118,10 @@ lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) | |||||
cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S | cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S | ||||
$(CC) -c $(PFLAGS) $< -o $(@F) | $(CC) -c $(PFLAGS) $< -o $(@F) | ||||
ifdef DYNAMIC_ARCH | |||||
#ifdef DYNAMIC_ARCH | |||||
clean :: | clean :: | ||||
@rm -f setparam_*.c kernel_*.h setparam.h kernel.h | @rm -f setparam_*.c kernel_*.h setparam.h kernel.h | ||||
endif | |||||
#endif | |||||
include $(TOPDIR)/Makefile.tail | include $(TOPDIR)/Makefile.tail |
@@ -668,7 +668,7 @@ $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | ||||
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) | $(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) | ||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ | |||||
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) | $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) | ||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | ||||
@@ -300,7 +300,11 @@ | |||||
.align 3 | .align 3 | ||||
.L999: | .L999: | ||||
j $31 | |||||
ADD s1, s1, s2 | ADD s1, s1, s2 | ||||
#ifdef DSDOT | |||||
cvt.d.s s1, s1 | |||||
#endif | |||||
j $31 | |||||
NOP | |||||
EPILOGUE | EPILOGUE |
@@ -101,7 +101,11 @@ gotoblas_t TABLE_NAME = { | |||||
#endif | #endif | ||||
ssymm_outcopyTS, ssymm_oltcopyTS, | ssymm_outcopyTS, ssymm_oltcopyTS, | ||||
#ifndef NO_LAPACK | |||||
sneg_tcopyTS, slaswp_ncopyTS, | sneg_tcopyTS, slaswp_ncopyTS, | ||||
#else | |||||
NULL,NULL, | |||||
#endif | |||||
0, 0, 0, | 0, 0, 0, | ||||
DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), | DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), | ||||
@@ -147,7 +151,11 @@ gotoblas_t TABLE_NAME = { | |||||
#endif | #endif | ||||
dsymm_outcopyTS, dsymm_oltcopyTS, | dsymm_outcopyTS, dsymm_oltcopyTS, | ||||
#ifndef NO_LAPACK | |||||
dneg_tcopyTS, dlaswp_ncopyTS, | dneg_tcopyTS, dlaswp_ncopyTS, | ||||
#else | |||||
NULL, NULL, | |||||
#endif | |||||
#ifdef EXPRECISION | #ifdef EXPRECISION | ||||
@@ -195,7 +203,11 @@ gotoblas_t TABLE_NAME = { | |||||
#endif | #endif | ||||
qsymm_outcopyTS, qsymm_oltcopyTS, | qsymm_outcopyTS, qsymm_oltcopyTS, | ||||
#ifndef NO_LAPACK | |||||
qneg_tcopyTS, qlaswp_ncopyTS, | qneg_tcopyTS, qlaswp_ncopyTS, | ||||
#else | |||||
NULL, NULL, | |||||
#endif | |||||
#endif | #endif | ||||
@@ -286,7 +298,11 @@ gotoblas_t TABLE_NAME = { | |||||
chemm3m_oucopyrTS, chemm3m_olcopyrTS, | chemm3m_oucopyrTS, chemm3m_olcopyrTS, | ||||
chemm3m_oucopyiTS, chemm3m_olcopyiTS, | chemm3m_oucopyiTS, chemm3m_olcopyiTS, | ||||
#ifndef NO_LAPACK | |||||
cneg_tcopyTS, claswp_ncopyTS, | cneg_tcopyTS, claswp_ncopyTS, | ||||
#else | |||||
NULL, NULL, | |||||
#endif | |||||
0, 0, 0, | 0, 0, 0, | ||||
ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), | ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), | ||||
@@ -375,7 +391,11 @@ gotoblas_t TABLE_NAME = { | |||||
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, | zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, | ||||
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, | zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, | ||||
#ifndef NO_LAPACK | |||||
zneg_tcopyTS, zlaswp_ncopyTS, | zneg_tcopyTS, zlaswp_ncopyTS, | ||||
#else | |||||
NULL, NULL, | |||||
#endif | |||||
#ifdef EXPRECISION | #ifdef EXPRECISION | ||||
@@ -466,7 +486,11 @@ gotoblas_t TABLE_NAME = { | |||||
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, | xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, | ||||
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, | xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, | ||||
#ifndef NO_LAPACK | |||||
xneg_tcopyTS, xlaswp_ncopyTS, | xneg_tcopyTS, xlaswp_ncopyTS, | ||||
#else | |||||
NULL, NULL, | |||||
#endif | |||||
#endif | #endif | ||||
@@ -1541,5 +1541,8 @@ | |||||
popl %ebx | popl %ebx | ||||
popl %esi | popl %esi | ||||
popl %edi | popl %edi | ||||
/*remove the hidden return value address from the stack.*/ | |||||
popl %ecx | |||||
xchgl %ecx, 0(%esp) | |||||
ret | ret | ||||
EPILOGUE | EPILOGUE |
@@ -1286,6 +1286,10 @@ | |||||
haddps %xmm0, %xmm0 | haddps %xmm0, %xmm0 | ||||
#endif | #endif | ||||
#ifdef DSDOT | |||||
cvtss2sd %xmm0, %xmm0 | |||||
#endif | |||||
RESTOREREGISTERS | RESTOREREGISTERS | ||||
ret | ret | ||||
@@ -544,7 +544,7 @@ | |||||
jg .L11 | jg .L11 | ||||
#if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
addq $1, KK | |||||
addq $4, KK | |||||
#endif | #endif | ||||
leaq (C, LDC, 4), C | leaq (C, LDC, 4), C | ||||
@@ -594,7 +594,7 @@ | |||||
jg .L11 | jg .L11 | ||||
#if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
addq $1, KK | |||||
addq $4, KK | |||||
#endif | #endif | ||||
leaq (C, LDC, 4), C | leaq (C, LDC, 4), C | ||||
@@ -0,0 +1,21 @@ | |||||
/*This is only for the "make install" target.*/ | |||||
/* BLASFUNC(FUNC) maps a BLAS routine name to its symbol name: a trailing
   underscore is appended when NEEDBUNDERSCORE is defined, otherwise the
   name is used unchanged. */
#ifdef NEEDBUNDERSCORE | |||||
#define BLASFUNC(FUNC) FUNC##_ | |||||
#else | |||||
#define BLASFUNC(FUNC) FUNC | |||||
#endif | |||||
/* BLASLONG / BLASULONG: (unsigned) long long when both OS_WINDOWS and
   __64BIT__ are defined, (unsigned) long otherwise.
   NOTE(review): presumably because long stays 32 bits wide on 64-bit
   Windows -- confirm against the main common.h definitions. */
#if defined(OS_WINDOWS) && defined(__64BIT__) | |||||
typedef long long BLASLONG; | |||||
typedef unsigned long long BLASULONG; | |||||
#else | |||||
typedef long BLASLONG; | |||||
typedef unsigned long BLASULONG; | |||||
#endif | |||||
/* blasint is the integer type used in the BLAS interface: BLASLONG when
   USE64BITINT is defined, plain int otherwise. */
#ifdef USE64BITINT | |||||
typedef BLASLONG blasint; | |||||
#else | |||||
typedef int blasint; | |||||
#endif | ||||| |
@@ -128,6 +128,8 @@ CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) | |||||
ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) | ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) | ||||
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | ||||
ifneq ($(NO_LAPACK), 1) | |||||
SBLASOBJS += \ | SBLASOBJS += \ | ||||
sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \ | sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \ | ||||
spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \ | spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \ | ||||
@@ -160,6 +162,7 @@ XBLASOBJS += | |||||
xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ | xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ | ||||
xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ | xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ | ||||
endif | |||||
include $(TOPDIR)/Makefile.tail | include $(TOPDIR)/Makefile.tail | ||||
@@ -5,12 +5,12 @@ include $(TOPDIR)/Makefile.system | |||||
TARGET=openblas_utest | TARGET=openblas_utest | ||||
CUNIT_LIB=/usr/local/lib/libcunit.a | CUNIT_LIB=/usr/local/lib/libcunit.a | ||||
OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o | |||||
OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o | |||||
all : run_test | all : run_test | ||||
$(TARGET): $(OBJS) | $(TARGET): $(OBJS) | ||||
$(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) | |||||
$(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) | |||||
run_test: $(TARGET) | run_test: $(TARGET) | ||||
./$(TARGET) | ./$(TARGET) | ||||
@@ -57,4 +57,8 @@ void test_caxpy_inc_0(void); | |||||
void test_zdotu_n_1(void); | void test_zdotu_n_1(void); | ||||
void test_zdotu_offset_1(void); | void test_zdotu_offset_1(void); | ||||
void test_drotmg(void); | |||||
void test_dsdot_n_1(void); | |||||
#endif | #endif |
@@ -54,7 +54,10 @@ CU_TestInfo test_level1[]={ | |||||
{"Testing zdotu with n == 1",test_zdotu_n_1}, | {"Testing zdotu with n == 1",test_zdotu_n_1}, | ||||
{"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, | {"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, | ||||
{"Testing drotmg",test_drotmg}, | |||||
{"Testing dsdot with n == 1",test_dsdot_n_1}, | |||||
CU_TEST_INFO_NULL, | CU_TEST_INFO_NULL, | ||||
}; | }; | ||||
@@ -0,0 +1,50 @@ | |||||
/***************************************************************************** | |||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ISCAS | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the ISCAS nor the names of its contributors may | |||||
be used to endorse or promote products derived from this software | |||||
without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
**********************************************************************************/ | |||||
#include "common_utest.h" | |||||
/* Unit test: dsdot on single-element vectors (n == 1, unit strides).
   Calls the routine through BLASFUNC and through BLASFUNC_REF with the
   same inputs and asserts that the two double results agree, passing
   CHECK_EPS as the tolerance argument of CU_ASSERT_DOUBLE_EQUAL. */
void test_dsdot_n_1() | |||||
{ | |||||
/* Single-precision inputs; the result is returned as double. */
float x= 0.172555164; | |||||
float y= -0.0138700781; | |||||
int incx=1; | |||||
int incy=1; | |||||
int n=1; | |||||
double res1=0.0f, res2=0.0f; | |||||
/* res1: result via BLASFUNC; res2: result via BLASFUNC_REF. */
res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); | |||||
res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy); | |||||
CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS); | |||||
} | ||||| |
@@ -0,0 +1,60 @@ | |||||
/***************************************************************************** | |||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ISCAS | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the ISCAS nor the names of its contributors may | |||||
be used to endorse or promote products derived from this software | |||||
without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
**********************************************************************************/ | |||||
#include "common_utest.h" | |||||
/* Unit test: drotmg with one fixed set of inputs.
   The te_* variables are passed to the OpenBLAS routine (BLASFUNC) and the
   tr_* variables to the reference routine (BLASFUNC_REF). Both sets start
   from identical values; after the calls every scalar argument and all five
   param entries are compared with CU_ASSERT_DOUBLE_EQUAL, passing CHECK_EPS
   as the tolerance argument. */
void test_drotmg() | |||||
{ | |||||
double te_d1, tr_d1; | |||||
double te_d2, tr_d2; | |||||
double te_x1, tr_x1; | |||||
double te_y1, tr_y1; | |||||
double te_param[5],tr_param[5]; | |||||
int i=0; | |||||
/* Give both implementations exactly the same starting values. */
te_d1= tr_d1=0.21149573940783739; | |||||
te_d2= tr_d2=0.046892057172954082; | |||||
te_x1= tr_x1=-0.42272687517106533; | |||||
te_y1= tr_y1=0.42211309121921659; | |||||
//OpenBLAS | |||||
BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); | |||||
//reference | |||||
BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param); | |||||
/* All scalar arguments are passed by pointer, so each one is compared. */
CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS); | |||||
CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS); | |||||
CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS); | |||||
CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS); | |||||
/* Compare every element of the 5-entry param array. */
for(i=0; i<5; i++){ | |||||
CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS); | |||||
} | |||||
} | ||||| |