@@ -219,10 +219,10 @@ prof_lapack : lapack_prebuild | |||
lapack_prebuild : | |||
ifndef NOFORTRAN | |||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
@@ -23,8 +23,8 @@ install : lib.grd | |||
#for inc | |||
@echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@awk '{print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h | |||
@@ -222,6 +222,11 @@ endif | |||
endif | |||
endif | |||
# ifeq logical or | |||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) | |||
OS_WINDOWS=1 | |||
endif | |||
ifdef QUAD_PRECISION | |||
CCOMMON_OPT += -DQUAD_PRECISION | |||
NO_EXPRECISION = 1 | |||
@@ -477,10 +482,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT | |||
FCOMMON_OPT += -Wall | |||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
ifneq ($(NO_LAPACK), 1) | |||
ifneq ($(C_COMPILER), LSB) | |||
EXTRALIB += -lgfortran | |||
endif | |||
endif | |||
ifdef NO_BINARY_MODE | |||
ifeq ($(ARCH), mips64) | |||
ifdef BINARY64 | |||
@@ -861,11 +864,18 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) | |||
#MAKEOVERRIDES = | |||
#For LAPACK Fortran codes. | |||
LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) | |||
LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) | |||
LAPACK_CFLAGS = $(CFLAGS) | |||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | |||
ifdef INTERFACE64 | |||
LAPACK_CFLAGS += -DLAPACK_ILP64 | |||
endif | |||
ifdef OS_WINDOWS | |||
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS | |||
endif | |||
ifeq ($(C_COMPILER), LSB) | |||
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE | |||
endif | |||
@@ -606,7 +606,8 @@ clean :: | |||
@if test -d $(ARCH); then \ | |||
(cd $(ARCH) && $(MAKE) clean) \ | |||
fi | |||
@rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \ | |||
@find . -name '*.o' | xargs rm -rf | |||
@rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \ | |||
*.csx *.is *~ *.exe *.flame *.pdb *.dwf \ | |||
gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \ | |||
*.pc *.pcl *.def *.i *.prof linktest.c \ | |||
@@ -441,9 +441,10 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
if (blas_server_avail){ | |||
SetEvent(pool.killed); | |||
printf("blas_num_threads=%d\n", blas_num_threads); | |||
for(i = 0; i < blas_num_threads - 1; i++){ | |||
WaitForSingleObject(blas_threads[i], INFINITE); | |||
WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
TerminateThread(blas_threads[i],0); | |||
} | |||
blas_server_avail = 0; | |||
@@ -363,7 +363,7 @@ static void *alloc_mmap(void *address){ | |||
#define BENCH_ITERATION 4 | |||
#define SCALING 2 | |||
static inline BLASULONG run_bench(BLASULONG address, long size) { | |||
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { | |||
BLASULONG original, *p; | |||
BLASULONG start, stop, min; | |||
@@ -450,12 +450,12 @@ static void *alloc_mmap(void *address){ | |||
current = (SCALING - 1) * BUFFER_SIZE; | |||
while(current > 0) { | |||
*(long *)start = (long)start + PAGESIZE; | |||
*(BLASLONG *)start = (BLASLONG)start + PAGESIZE; | |||
start += PAGESIZE; | |||
current -= PAGESIZE; | |||
} | |||
*(long *)(start - PAGESIZE) = (BLASULONG)map_address; | |||
*(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; | |||
start = (BLASULONG)map_address; | |||
@@ -1170,7 +1170,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, | |||
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC) | |||
long size; | |||
size_t size; | |||
BLASULONG buffer; | |||
size = BUFFER_SIZE - PAGESIZE; | |||
@@ -111,7 +111,7 @@ libgoto_hpl.def : gensymbol | |||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) | |||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def | |||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
symbol.$(SUFFIX) : symbol.S | |||
$(CC) $(CFLAGS) -c -o $(@F) $^ | |||
@@ -124,14 +124,17 @@ ifeq ($(OSNAME), Linux) | |||
so : ../$(LIBSONAME) | |||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c | |||
ifneq ($(C_COMPILER), LSB) | |||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | |||
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) | |||
ifneq ($(C_COMPILER), LSB) | |||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
else | |||
#Use FC on LSB | |||
$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
#for LSB | |||
env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ | |||
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) | |||
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
endif | |||
rm -f linktest | |||
@@ -60,7 +60,6 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT * | |||
}; | |||
#endif | |||
extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info); | |||
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
@@ -133,18 +132,6 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In | |||
if (args.nthreads == 1) { | |||
#endif | |||
#if DOUBLE | |||
// double trtri_U single thread error | |||
// call dtrtri from lapack for a walk around. | |||
if(uplo==0){ | |||
BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info); | |||
#ifndef PPC440 | |||
blas_memory_free(buffer); | |||
#endif | |||
return 0; | |||
} | |||
#endif | |||
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); | |||
#ifdef SMP | |||
@@ -103,7 +103,7 @@ | |||
vmovups -10*SIZE(AO,%rax,8), %xmm6 | |||
vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 | |||
vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_8x2 | |||
@@ -265,7 +265,7 @@ | |||
vmovups -14*SIZE(AO,%rax,4), %xmm0 | |||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 | |||
vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -338,7 +338,7 @@ | |||
vmovups -16*SIZE(AO,%rax,2), %xmm0 | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -378,7 +378,7 @@ | |||
vmovups -16*SIZE(BO,%rax,2), %xmm1 | |||
vmovddup -16*SIZE(AO,%rax,1), %xmm0 | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_1x2 | |||
@@ -411,7 +411,7 @@ | |||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 | |||
vmovups -10*SIZE(AO,%rax,8), %xmm0 | |||
vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_8x1 | |||
@@ -510,7 +510,7 @@ | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
vmovups -14*SIZE(AO,%rax,4), %xmm0 | |||
vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -560,7 +560,7 @@ | |||
vmovddup -16*SIZE(BO,%rax,1), %xmm1 | |||
vmovups -16*SIZE(AO,%rax,2), %xmm0 | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -592,7 +592,7 @@ | |||
vmovsd -16*SIZE(BO,%rax,1), %xmm1 | |||
vmovsd -16*SIZE(AO,%rax,1), %xmm0 | |||
vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_1x1 | |||
@@ -103,7 +103,7 @@ | |||
vmovups -10*SIZE(AO,%rax,8), %xmm6 | |||
vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 | |||
vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_8x2 | |||
@@ -177,7 +177,7 @@ | |||
vmovups -14*SIZE(AO,%rax,4), %xmm0 | |||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 | |||
vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -226,7 +226,7 @@ | |||
vmovups -16*SIZE(AO,%rax,2), %xmm0 | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -262,7 +262,7 @@ | |||
vmovups -16*SIZE(BO,%rax,2), %xmm1 | |||
vmovddup -16*SIZE(AO,%rax,1), %xmm0 | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_1x2 | |||
@@ -306,7 +306,7 @@ | |||
vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 | |||
vmovups -10*SIZE(AO,%rax,8), %xmm0 | |||
vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_8x1 | |||
@@ -347,7 +347,7 @@ | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
vmovups -14*SIZE(AO,%rax,4), %xmm0 | |||
vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -377,7 +377,7 @@ | |||
vmovddup -16*SIZE(BO,%rax,1), %xmm1 | |||
vmovups -16*SIZE(AO,%rax,2), %xmm0 | |||
vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
@@ -402,7 +402,7 @@ | |||
vmovsd -16*SIZE(BO,%rax,1), %xmm1 | |||
vmovsd -16*SIZE(AO,%rax,1), %xmm0 | |||
vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 | |||
addq $SIZE, %rax | |||
addq $ SIZE, %rax | |||
.endm | |||
.macro SOLVE_1x1 | |||
@@ -45,7 +45,11 @@ extern "C" { | |||
#ifndef lapack_int | |||
#if defined(LAPACK_ILP64) | |||
#if defined(OPENBLAS_OS_WINDOWS) | |||
#define lapack_int long long | |||
#else | |||
#define lapack_int long | |||
#endif | |||
#else | |||
#define lapack_int int | |||
#endif | |||
@@ -67,14 +67,14 @@ double sqrt(double); | |||
#undef GETRF_FACTOR | |||
#define GETRF_FACTOR 1.00 | |||
static inline long FORMULA1(long M, long N, long IS, long BK, long T) { | |||
static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { | |||
double m = (double)(M - IS - BK); | |||
double n = (double)(N - IS - BK); | |||
double b = (double)BK; | |||
double a = (double)T; | |||
return (long)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); | |||
return (BLASLONG)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); | |||
} | |||
@@ -111,7 +111,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra | |||
if (args -> a == NULL) { | |||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); | |||
sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
} else { | |||
sb = (FLOAT *)args -> a; | |||
} | |||
@@ -221,7 +221,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * | |||
if (args -> a == NULL) { | |||
TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); | |||
sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
} else { | |||
sb = (FLOAT *)args -> a; | |||
} | |||
@@ -448,7 +448,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); | |||
sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
is = 0; | |||
num_cpu = 0; | |||
@@ -685,7 +685,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
if (width > n - init_bk) width = n - init_bk; | |||
if (width < init_bk) { | |||
long temp; | |||
BLASLONG temp; | |||
temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); | |||
temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); | |||
@@ -708,7 +708,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
is = 0; | |||
num_cpu = 0; | |||
sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
sbb = (FLOAT *)((((BLASULONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
while (is < mn) { | |||
@@ -178,7 +178,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
return info; | |||
} | |||
sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
info = 0; | |||
@@ -82,7 +82,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
return info; | |||
} | |||
sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
info = 0; | |||
@@ -185,7 +185,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||
buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); | |||
for (i = 1; i < DIVIDE_RATE; i++) { | |||
buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; | |||
} | |||
@@ -13,7 +13,6 @@ ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_sing | |||
XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX) | |||
DBLASOBJS += dtrtri_lapack.$(SUFFIX) | |||
ifdef SMP | |||
SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX) | |||
@@ -54,9 +53,6 @@ dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c | |||
dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) | |||
dtrtri_lapack.$(SUFFIX) : dtrtri_lapack.f | |||
$(FC) -c $(FFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) | |||
dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) | |||
@@ -1,242 +0,0 @@ | |||
*> \brief \b DTRTRI | |||
* | |||
* =========== DOCUMENTATION =========== | |||
* | |||
* Online html documentation available at | |||
* http://www.netlib.org/lapack/explore-html/ | |||
* | |||
*> \htmlonly | |||
*> Download DTRTRI + dependencies | |||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dtrtri.f"> | |||
*> [TGZ]</a> | |||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dtrtri.f"> | |||
*> [ZIP]</a> | |||
*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dtrtri.f"> | |||
*> [TXT]</a> | |||
*> \endhtmlonly | |||
* | |||
* Definition: | |||
* =========== | |||
* | |||
* SUBROUTINE DTRTRI( UPLO, DIAG, N, A, LDA, INFO ) | |||
* | |||
* .. Scalar Arguments .. | |||
* CHARACTER DIAG, UPLO | |||
* INTEGER INFO, LDA, N | |||
* .. | |||
* .. Array Arguments .. | |||
* DOUBLE PRECISION A( LDA, * ) | |||
* .. | |||
* | |||
* | |||
*> \par Purpose: | |||
* ============= | |||
*> | |||
*> \verbatim | |||
*> | |||
*> DTRTRI computes the inverse of a real upper or lower triangular | |||
*> matrix A. | |||
*> | |||
*> This is the Level 3 BLAS version of the algorithm. | |||
*> \endverbatim | |||
* | |||
* Arguments: | |||
* ========== | |||
* | |||
*> \param[in] UPLO | |||
*> \verbatim | |||
*> UPLO is CHARACTER*1 | |||
*> = 'U': A is upper triangular; | |||
*> = 'L': A is lower triangular. | |||
*> \endverbatim | |||
*> | |||
*> \param[in] DIAG | |||
*> \verbatim | |||
*> DIAG is CHARACTER*1 | |||
*> = 'N': A is non-unit triangular; | |||
*> = 'U': A is unit triangular. | |||
*> \endverbatim | |||
*> | |||
*> \param[in] N | |||
*> \verbatim | |||
*> N is INTEGER | |||
*> The order of the matrix A. N >= 0. | |||
*> \endverbatim | |||
*> | |||
*> \param[in,out] A | |||
*> \verbatim | |||
*> A is DOUBLE PRECISION array, dimension (LDA,N) | |||
*> On entry, the triangular matrix A. If UPLO = 'U', the | |||
*> leading N-by-N upper triangular part of the array A contains | |||
*> the upper triangular matrix, and the strictly lower | |||
*> triangular part of A is not referenced. If UPLO = 'L', the | |||
*> leading N-by-N lower triangular part of the array A contains | |||
*> the lower triangular matrix, and the strictly upper | |||
*> triangular part of A is not referenced. If DIAG = 'U', the | |||
*> diagonal elements of A are also not referenced and are | |||
*> assumed to be 1. | |||
*> On exit, the (triangular) inverse of the original matrix, in | |||
*> the same storage format. | |||
*> \endverbatim | |||
*> | |||
*> \param[in] LDA | |||
*> \verbatim | |||
*> LDA is INTEGER | |||
*> The leading dimension of the array A. LDA >= max(1,N). | |||
*> \endverbatim | |||
*> | |||
*> \param[out] INFO | |||
*> \verbatim | |||
*> INFO is INTEGER | |||
*> = 0: successful exit | |||
*> < 0: if INFO = -i, the i-th argument had an illegal value | |||
*> > 0: if INFO = i, A(i,i) is exactly zero. The triangular | |||
*> matrix is singular and its inverse can not be computed. | |||
*> \endverbatim | |||
* | |||
* Authors: | |||
* ======== | |||
* | |||
*> \author Univ. of Tennessee | |||
*> \author Univ. of California Berkeley | |||
*> \author Univ. of Colorado Denver | |||
*> \author NAG Ltd. | |||
* | |||
*> \date November 2011 | |||
* | |||
*> \ingroup doubleOTHERcomputational | |||
* | |||
* ===================================================================== | |||
SUBROUTINE DTRTRILAPACK( UPLO, DIAG, N, A, LDA, INFO ) | |||
* | |||
* -- LAPACK computational routine (version 3.4.0) -- | |||
* -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
* November 2011 | |||
* | |||
* .. Scalar Arguments .. | |||
CHARACTER DIAG, UPLO | |||
INTEGER INFO, LDA, N | |||
* .. | |||
* .. Array Arguments .. | |||
DOUBLE PRECISION A( LDA, * ) | |||
* .. | |||
* | |||
* ===================================================================== | |||
* | |||
* .. Parameters .. | |||
DOUBLE PRECISION ONE, ZERO | |||
PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) | |||
* .. | |||
* .. Local Scalars .. | |||
LOGICAL NOUNIT, UPPER | |||
INTEGER J, JB, NB, NN | |||
* .. | |||
* .. External Functions .. | |||
LOGICAL LSAME | |||
INTEGER ILAENV | |||
EXTERNAL LSAME, ILAENV | |||
* .. | |||
* .. External Subroutines .. | |||
EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA | |||
* .. | |||
* .. Intrinsic Functions .. | |||
INTRINSIC MAX, MIN | |||
* .. | |||
* .. Executable Statements .. | |||
* | |||
* Test the input parameters. | |||
* | |||
INFO = 0 | |||
UPPER = LSAME( UPLO, 'U' ) | |||
NOUNIT = LSAME( DIAG, 'N' ) | |||
IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN | |||
INFO = -1 | |||
ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN | |||
INFO = -2 | |||
ELSE IF( N.LT.0 ) THEN | |||
INFO = -3 | |||
ELSE IF( LDA.LT.MAX( 1, N ) ) THEN | |||
INFO = -5 | |||
END IF | |||
IF( INFO.NE.0 ) THEN | |||
CALL XERBLA( 'DTRTRI', -INFO ) | |||
RETURN | |||
END IF | |||
* | |||
* Quick return if possible | |||
* | |||
IF( N.EQ.0 ) | |||
$ RETURN | |||
* | |||
* Check for singularity if non-unit. | |||
* | |||
IF( NOUNIT ) THEN | |||
DO 10 INFO = 1, N | |||
IF( A( INFO, INFO ).EQ.ZERO ) | |||
$ RETURN | |||
10 CONTINUE | |||
INFO = 0 | |||
END IF | |||
* | |||
* Determine the block size for this environment. | |||
* | |||
NB = ILAENV( 1, 'DTRTRI', UPLO // DIAG, N, -1, -1, -1 ) | |||
IF( NB.LE.1 .OR. NB.GE.N ) THEN | |||
* | |||
* Use unblocked code | |||
* | |||
CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO ) | |||
ELSE | |||
* | |||
* Use blocked code | |||
* | |||
IF( UPPER ) THEN | |||
* | |||
* Compute inverse of upper triangular matrix | |||
* | |||
DO 20 J = 1, N, NB | |||
JB = MIN( NB, N-J+1 ) | |||
* | |||
* Compute rows 1:j-1 of current block column | |||
* | |||
CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, | |||
$ JB, ONE, A, LDA, A( 1, J ), LDA ) | |||
CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, | |||
$ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) | |||
* | |||
* Compute inverse of current diagonal block | |||
* | |||
CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) | |||
20 CONTINUE | |||
ELSE | |||
* | |||
* Compute inverse of lower triangular matrix | |||
* | |||
NN = ( ( N-1 ) / NB )*NB + 1 | |||
DO 30 J = NN, 1, -NB | |||
JB = MIN( NB, N-J+1 ) | |||
IF( J+JB.LE.N ) THEN | |||
* | |||
* Compute rows j+jb:n of current block column | |||
* | |||
CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG, | |||
$ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, | |||
$ A( J+JB, J ), LDA ) | |||
CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG, | |||
$ N-J-JB+1, JB, -ONE, A( J, J ), LDA, | |||
$ A( J+JB, J ), LDA ) | |||
END IF | |||
* | |||
* Compute inverse of current diagonal block | |||
* | |||
CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) | |||
30 CONTINUE | |||
END IF | |||
END IF | |||
* | |||
RETURN | |||
* | |||
* End of DTRTRI | |||
* | |||
END |
@@ -127,8 +127,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
if (ls == i + bk) { | |||
NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
GEMM_BETA(min_i, bk, 0, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
@@ -171,8 +177,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
min_i = i - is; | |||
if (min_i > GEMM_P) min_i = GEMM_P; | |||
NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
//NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); | |||
GEMM_BETA(min_i, bk, 0, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
#endif | |||
NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); | |||
TRSM_KERNEL_RN(min_i, bk, bk, dm1, | |||
#ifdef COMPLEX | |||
ZERO, | |||
@@ -1,8 +1,8 @@ | |||
/*This is only for "make install" target.*/ | |||
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) | |||
#define WINDOWS_ABI | |||
#define OS_WINDOWS | |||
#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX) | |||
#define OPENBLAS_WINDOWS_ABI | |||
#define OPENBLAS_OS_WINDOWS | |||
#ifdef DOUBLE | |||
#define DOUBLE_DEFINED DOUBLE | |||
@@ -10,23 +10,23 @@ | |||
#endif | |||
#endif | |||
#ifdef NEEDBUNDERSCORE | |||
#ifdef OPENBLAS_NEEDBUNDERSCORE | |||
#define BLASFUNC(FUNC) FUNC##_ | |||
#else | |||
#define BLASFUNC(FUNC) FUNC | |||
#endif | |||
#ifdef QUAD_PRECISION | |||
#ifdef OPENBLAS_QUAD_PRECISION | |||
typedef struct { | |||
unsigned long x[2]; | |||
} xdouble; | |||
#elif defined EXPRECISION | |||
#elif defined OPENBLAS_EXPRECISION | |||
#define xdouble long double | |||
#else | |||
#define xdouble double | |||
#endif | |||
#if defined(OS_WINDOWS) && defined(__64BIT__) | |||
#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__) | |||
typedef long long BLASLONG; | |||
typedef unsigned long long BLASULONG; | |||
#else | |||
@@ -34,7 +34,7 @@ typedef long BLASLONG; | |||
typedef unsigned long BLASULONG; | |||
#endif | |||
#ifdef USE64BITINT | |||
#ifdef OPENBLAS_USE64BITINT | |||
typedef BLASLONG blasint; | |||
#else | |||
typedef int blasint; | |||