Fixed c/zgemm and zgemv computational errors of haswell, piledriver, bulldozer, and barcelona on Windows.

Merge branch 'develop' of https://github.com/wernsaar/OpenBLAS into wernsaar-develop

Conflicts:
	kernel/Makefile.L1
	kernel/x86_64/KERNEL
	param.h
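The computational errors referred to here showed up as wrong results from the complex GEMM/GEMV kernels in Windows builds. For readers who want to re-verify a build for this kind of problem, the sketch below compares cblas_zgemv against a straightforward reference loop; it is only an illustrative harness (the sizes, seed, tolerance and the naive_zgemv helper are assumptions, not part of this commit).

```c
/* Illustrative check of cblas_zgemv against a naive reference.
 * Hypothetical test harness; build against the CBLAS interface, e.g.:
 *   gcc -std=c99 check_zgemv.c -lopenblas */
#include <cblas.h>
#include <complex.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

static double complex rnd(void)
{
    return (rand() / (double)RAND_MAX - 0.5) + (rand() / (double)RAND_MAX - 0.5) * I;
}

/* Naive y = alpha*A*x + beta*y for a column-major M x N matrix, no transpose. */
static void naive_zgemv(int m, int n, double complex alpha, const double complex *A,
                        int lda, const double complex *x, double complex beta,
                        double complex *y)
{
    for (int i = 0; i < m; i++) {
        double complex acc = 0.0;
        for (int j = 0; j < n; j++)
            acc += A[i + (size_t)j * lda] * x[j];
        y[i] = alpha * acc + beta * y[i];
    }
}

int main(void)
{
    enum { M = 237, N = 181 };
    double complex alpha = 0.7 - 0.3 * I, beta = 1.1 + 0.2 * I;
    double complex *A = malloc(sizeof(*A) * M * N), *x = malloc(sizeof(*x) * N);
    double complex *y = malloc(sizeof(*y) * M), *yref = malloc(sizeof(*yref) * M);

    srand(1);
    for (int i = 0; i < M * N; i++) A[i] = rnd();
    for (int j = 0; j < N; j++)     x[j] = rnd();
    for (int i = 0; i < M; i++)     y[i] = yref[i] = rnd();

    naive_zgemv(M, N, alpha, A, M, x, beta, yref);
    cblas_zgemv(CblasColMajor, CblasNoTrans, M, N, &alpha, A, M, x, 1, &beta, y, 1);

    double maxerr = 0.0;
    for (int i = 0; i < M; i++) maxerr = fmax(maxerr, cabs(y[i] - yref[i]));
    printf("max |zgemv error| = %g\n", maxerr);
    return maxerr < 1e-10 ? 0 : 1;
}
```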
@@ -23,7 +23,7 @@ endif
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
 .PHONY : all libs netlib test ctest shared install
-.NOTPARALLEL : all libs prof lapack-test install
+.NOTPARALLEL : all libs prof lapack-test install blas-test
 all :: libs netlib tests shared
 	@echo
@@ -282,6 +282,11 @@ lapack-test :
 	make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
 	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+
+blas-test:
+	(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
+	make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
+	(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
 dummy :
@@ -687,15 +687,27 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
 $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
-$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
 $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
+ifdef DSDOTKERNEL
+$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
+$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSDOTKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
+else
+$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
+$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
+endif
 $(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL)
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@
@@ -119,9 +119,15 @@ XCOPYKERNEL = zcopy.S
 endif
 ifndef SDOTKERNEL
-SDOTKERNEL = ../arm/dot.c
+SDOTKERNEL = dot_sse.S
 endif
+ifndef DSDOTKERNEL
+DSDOTKERNEL = ../arm/dot.c
+endif
 ifndef DDOTKERNEL
 DDOTKERNEL = dot_sse2.S
 endif
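The DSDOTKERNEL hook added above lets the plain single-precision dot kernel (SDOTKERNEL, now defaulting to dot_sse.S) differ from the kernel used for dsdot/sdsdot, which take float inputs but are defined to accumulate in double precision and therefore keep the generic C kernel as their default. The snippet below only illustrates the three CBLAS entry points this split serves; it is not part of the change.

```c
/* The three single-precision dot variants behind the SDOT/DSDOT kernel split
 * (illustrative use of the CBLAS interface only). */
#include <cblas.h>
#include <stdio.h>

int main(void)
{
    float x[3] = {1.0f, 2.0f, 3.0f};
    float y[3] = {4.0f, 5.0f, 6.0f};

    float  s  = cblas_sdot(3, x, 1, y, 1);          /* float inputs, float accumulation/result   */
    double d  = cblas_dsdot(3, x, 1, y, 1);         /* float inputs, double accumulation, double result */
    float  sb = cblas_sdsdot(3, 10.0f, x, 1, y, 1); /* 10 + dot(x,y), accumulated in double, returned as float */

    printf("sdot=%f dsdot=%f sdsdot=%f\n", s, d, sb);
    return 0;
}
```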
@@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
 SGEMVTKERNEL = sgemv_t.S
 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
 SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
 SGEMMINCOPY = ../generic/gemm_ncopy_8.c
@@ -1,5 +1,5 @@
 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
 SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
 SGEMMINCOPY = ../generic/gemm_ncopy_8.c
@@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
 SGEMVTKERNEL = sgemv_t.S
 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
 DGEMVNKERNEL = dgemv_n_bulldozer.S
 DGEMVTKERNEL = dgemv_t_bulldozer.S
@@ -1,5 +1,5 @@
 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
 SGEMMKERNEL = gemm_kernel_8x4_sse.S
 SGEMMINCOPY = ../generic/gemm_ncopy_8.c
@@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
 SGEMVTKERNEL = sgemv_t.S
 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
 DGEMVNKERNEL = dgemv_n_bulldozer.S
 DGEMVTKERNEL = dgemv_t_bulldozer.S
@@ -1,5 +1,5 @@
 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S
 SGEMMKERNEL = gemm_kernel_8x4_sse3.S
 SGEMMINCOPY = ../generic/gemm_ncopy_8.c
@@ -21,11 +21,11 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
-CGEMMINCOPY = zgemm_ncopy_2.S
-CGEMMITCOPY = zgemm_tcopy_2.S
-CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMKERNEL = cgemm_kernel_8x2_sandy.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
 CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -522,16 +522,16 @@
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -541,14 +541,15 @@
 movq OLD_C, C
 movq OLD_LDC, LDC
 #ifdef TRMMKERNEL
-movsd OLD_OFFSET, %xmm12
+vmovsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
 #ifdef TRMMKERNEL
-movsd STACKSIZE + 16(%rsp), %xmm12
+vmovsd STACKSIZE + 16(%rsp), %xmm12
 #endif
 #endif
@@ -1865,6 +1866,8 @@
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1876,16 +1879,16 @@
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $STACKSIZE, %rsp
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 /*********************************************************************
 *
-* 2013/10/31 Saar
+* 2014/06/28 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -546,16 +546,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -568,6 +568,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 movsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
@@ -1889,6 +1890,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1900,16 +1903,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $STACKSIZE, %rsp
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /*********************************************************************
-* 2013/11/13 Saar
+* 2014/06/28 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -816,16 +816,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -838,6 +838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 movsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
@@ -2253,6 +2254,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -2264,16 +2267,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $ STACKSIZE, %rsp
@@ -412,16 +412,16 @@
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -431,14 +431,15 @@
 movq OLD_C, C
 movq OLD_LDC, LDC
 #ifdef TRMMKERNEL
-movsd OLD_OFFSET, %xmm12
+vmovsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
 #ifdef TRMMKERNEL
-movsd STACKSIZE + 16(%rsp), %xmm12
+vmovsd STACKSIZE + 16(%rsp), %xmm12
 #endif
 #endif
@@ -1372,6 +1373,8 @@
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1383,16 +1386,16 @@
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $STACKSIZE, %rsp
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /*********************************************************************
 *
-* 2013/10/30 Saar
+* 2014/06/28 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -437,16 +437,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -456,14 +456,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 movq OLD_C, C
 movq OLD_LDC, LDC
 #ifdef TRMMKERNEL
-movsd OLD_OFFSET, %xmm12
+vmovsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
 #ifdef TRMMKERNEL
-movsd STACKSIZE + 16(%rsp), %xmm12
+vmovsd STACKSIZE + 16(%rsp), %xmm12
 #endif
 #endif
@@ -1397,6 +1398,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1408,16 +1411,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $STACKSIZE, %rsp
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 /********************************************************************************
-* 2013/11/13 Saar
+* 2014/06/28 Saar
 * BLASTEST : OK
 * CTEST : OK
 * TEST : OK
@@ -693,16 +693,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -715,6 +715,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 movsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
@@ -1781,6 +1782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1792,16 +1795,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $ STACKSIZE, %rsp
@@ -1,7 +1,7 @@
 SHELL = /bin/sh
 PLAT = _LINUX
 DRVOPTS = $(OPTS)
-LOADER = $(FORTRAN) -pthread
+LOADER = $(FORTRAN)
 ARCHFLAGS= -ru
 #RANLIB = ranlib
@@ -1111,14 +1111,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_M 8
 #define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_M 4
 #define XGEMM_DEFAULT_UNROLL_M 1
 #define SGEMM_DEFAULT_UNROLL_N 4
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 #endif
@@ -1134,7 +1134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define QGEMM_DEFAULT_P 504
 #define QGEMM_DEFAULT_R qgemm_r
-#define CGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 384
 //#define CGEMM_DEFAULT_R cgemm_r
 #define CGEMM_DEFAULT_R 1024
@@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 384
 #define DGEMM_DEFAULT_Q 256
 #define QGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 256
+#define CGEMM_DEFAULT_Q 192
 #define ZGEMM_DEFAULT_Q 192
 #define XGEMM_DEFAULT_Q 128
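The param.h changes above retune the complex-single defaults to an 8x2 register tile, matching the cgemm_kernel_8x2_sandy.S micro-kernel selected earlier in the diff, and adjust the P and Q blocking sizes to go with it. As a rough, hedged picture of what these constants control (in OpenBLAS the P, Q and R values conventionally block the M, K and N dimensions, while UNROLL_M x UNROLL_N is the tile of C the assembly micro-kernel produces per inner step), the sketch below shows a generic cache-blocked matrix multiply. The block sizes and the blocked_gemm helper are illustrative placeholders, not the actual OpenBLAS level-3 driver.

```c
/* Generic cache-blocked GEMM sketch: BLOCK_P/Q/R play the role of the
 * *_DEFAULT_P/Q/R block sizes (blocking M, K and N respectively), and the
 * innermost loop pair stands in for the UNROLL_M x UNROLL_N register tile
 * of the assembly micro-kernel.  Illustration only, not OpenBLAS code. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_P 64   /* placeholder M-block */
#define BLOCK_Q 32   /* placeholder K-block */
#define BLOCK_R 64   /* placeholder N-block */

static int imin(int a, int b) { return a < b ? a : b; }

/* C (MxN) += A (MxK) * B (KxN), all row-major, blocked along M, K and N. */
static void blocked_gemm(int M, int N, int K, const float *A, const float *B, float *C)
{
    for (int jb = 0; jb < N; jb += BLOCK_R)
        for (int kb = 0; kb < K; kb += BLOCK_Q)
            for (int ib = 0; ib < M; ib += BLOCK_P)
                /* a real micro-kernel would cover UNROLL_M x UNROLL_N outputs
                 * per step; this sketch handles one element at a time */
                for (int i = ib; i < imin(ib + BLOCK_P, M); i++)
                    for (int j = jb; j < imin(jb + BLOCK_R, N); j++) {
                        float acc = 0.0f;
                        for (int k = kb; k < imin(kb + BLOCK_Q, K); k++)
                            acc += A[i * K + k] * B[k * N + j];
                        C[i * N + j] += acc;
                    }
}

int main(void)
{
    enum { M = 129, N = 115, K = 77 };
    float *A = calloc(M * K, sizeof(float)), *B = calloc(K * N, sizeof(float));
    float *C = calloc(M * N, sizeof(float)), *Cref = calloc(M * N, sizeof(float));

    for (int i = 0; i < M * K; i++) A[i] = rand() / (float)RAND_MAX - 0.5f;
    for (int i = 0; i < K * N; i++) B[i] = rand() / (float)RAND_MAX - 0.5f;

    blocked_gemm(M, N, K, A, B, C);
    for (int i = 0; i < M; i++)                 /* unblocked reference */
        for (int j = 0; j < N; j++)
            for (int k = 0; k < K; k++)
                Cref[i * N + j] += A[i * K + k] * B[k * N + j];

    float maxerr = 0.0f;
    for (int i = 0; i < M * N; i++) maxerr = fmaxf(maxerr, fabsf(C[i] - Cref[i]));
    printf("blocked vs naive max |diff| = %g\n", maxerr);
    return 0;
}
```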