Browse Source

Merge branch 'release-v0.1alpha2' into develop

tags/v0.1alpha2.4^2
Xianyi Zhang 14 years ago
parent
commit
ca8bf5abb0
15 changed files with 12575 additions and 19 deletions
  1. +5
    -2
      Changelog.txt
  2. +1
    -1
      Makefile
  3. +2
    -1
      README
  4. +5
    -0
      common_mips64.h
  5. +1
    -1
      driver/others/blas_server_omp.c
  6. +5
    -4
      exports/Makefile
  7. +22
    -0
      kernel/mips64/KERNEL
  8. +22
    -0
      kernel/mips64/KERNEL.LOONGSON3A
  9. +2390
    -0
      kernel/mips64/gemm_kernel_loongson3a.S
  10. +2579
    -0
      kernel/mips64/sgemm_kernel_loongson3a.S
  11. +1938
    -0
      kernel/mips64/trsm_kernel_LN_loongson3a.S
  12. +1783
    -0
      kernel/mips64/trsm_kernel_LT_loongson3a.S
  13. +1852
    -0
      kernel/mips64/trsm_kernel_RN_loongson3a.S
  14. +1958
    -0
      kernel/mips64/trsm_kernel_RT_loongson3a.S
  15. +12
    -10
      param.h

+ 5
- 2
Changelog.txt View File

@@ -1,7 +1,8 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
==================================================================== ====================================================================
Version 0.1 alpha2(in development)
0;136;0c
Version 0.1 alpha2
23-Jun-2011

common: common:
* Fixed blasint undefined bug in <cblas.h> file. Other software * Fixed blasint undefined bug in <cblas.h> file. Other software
can now include this header successfully (Refs issue #13 on github) can now include this header successfully (Refs issue #13 on github)
@@ -31,6 +32,8 @@ x86/x86_64:


MIPS64: MIPS64:
* Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
* Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
* Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)


==================================================================== ====================================================================
Version 0.1 alpha1 Version 0.1 alpha1


+ 1
- 1
Makefile View File

@@ -74,7 +74,7 @@ ifeq ($(OSNAME), Darwin)
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
# -ln -fs $(LIBDLLNAME) libopenblas.dll
-ln -fs $(LIBDLLNAME) libopenblas.dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll $(MAKE) -C exports dll


+ 2
- 1
README View File

@@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
9.Known Issues 9.Known Issues
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit * The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32. is 64. On 32 bits, it is 32.
* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) can compile the code successfully.


10. Specification of Git Branches 10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
@@ -79,4 +80,4 @@ Now, there are 4 branches in github.com.
* The master branch. This is a main branch to reflect a production-ready state. * The master branch. This is a main branch to reflect a production-ready state.
* The develop branch. This is a main branch to reflect a state with the latest delivered development changes for the next release. * The develop branch. This is a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A code on this branch. We will merge this feature into the develop branch in the future. * The loongson3a branch. This is a feature branch. We develop Loongson3A code on this branch. We will merge this feature into the develop branch in the future.
* The gh-pages branch. This is for web pages
* The gh-pages branch. This is for web pages

+ 5
- 0
common_mips64.h View File

@@ -220,6 +220,11 @@ REALNAME: ;\


#define BUFFER_SIZE ( 8 << 20) #define BUFFER_SIZE ( 8 << 20)


#if defined(LOONGSON3A)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif

#ifndef PAGESIZE #ifndef PAGESIZE
#define PAGESIZE (64UL << 10) #define PAGESIZE (64UL << 10)
#endif #endif


+ 1
- 1
driver/others/blas_server_omp.c View File

@@ -38,7 +38,7 @@


#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <sys/mman.h>
//#include <sys/mman.h>
#include "common.h" #include "common.h"


#ifndef USE_OPENMP #ifndef USE_OPENMP


+ 5
- 4
exports/Makefile View File

@@ -53,18 +53,19 @@ dyn : $(LIBDYNNAME)
zip : dll zip : dll
zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME)


dll : libgoto2.dll
dll : ../$(LIBDLLNAME)
#libgoto2.dll


dll2 : libgoto2_shared.dll dll2 : libgoto2_shared.dll


libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME) $(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1) ifeq ($(BINARY32), 1)
$(DLLWRAP) -o $(@F) --def libgoto2.def \
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:i386 /def:libgoto2.def -lib /machine:i386 /def:libgoto2.def
else else
$(DLLWRAP) -o $(@F) --def libgoto2.def \
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:X64 /def:libgoto2.def -lib /machine:X64 /def:libgoto2.def
endif endif


+ 22
- 0
kernel/mips64/KERNEL View File

@@ -91,15 +91,37 @@ ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c
endif endif


ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = trsm_kernel_LN.S STRSMKERNEL_LN = trsm_kernel_LN.S
endif

ifndef STRSMKERNEL_LT
STRSMKERNEL_LT = trsm_kernel_LT.S STRSMKERNEL_LT = trsm_kernel_LT.S
endif

ifndef STRSMKERNEL_RN
STRSMKERNEL_RN = trsm_kernel_LT.S STRSMKERNEL_RN = trsm_kernel_LT.S
endif

ifndef STRSMKERNEL_RT
STRSMKERNEL_RT = trsm_kernel_RT.S STRSMKERNEL_RT = trsm_kernel_RT.S
endif


ifndef DTRSMKERNEL_LN
DTRSMKERNEL_LN = trsm_kernel_LN.S DTRSMKERNEL_LN = trsm_kernel_LN.S
endif

ifndef DTRSMKERNEL_LT
DTRSMKERNEL_LT = trsm_kernel_LT.S DTRSMKERNEL_LT = trsm_kernel_LT.S
endif

ifndef DTRSMKERNEL_RN
DTRSMKERNEL_RN = trsm_kernel_LT.S DTRSMKERNEL_RN = trsm_kernel_LT.S
endif

ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S DTRSMKERNEL_RT = trsm_kernel_RT.S
endif


CTRSMKERNEL_LN = ztrsm_kernel_LT.S CTRSMKERNEL_LN = ztrsm_kernel_LT.S
CTRSMKERNEL_LT = ztrsm_kernel_LT.S CTRSMKERNEL_LT = ztrsm_kernel_LT.S


+ 22
- 0
kernel/mips64/KERNEL.LOONGSON3A View File

@@ -1,2 +1,24 @@
SAXPYKERNEL=axpy_loongson3a.S SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S DAXPYKERNEL=daxpy_loongson3a_simd.S

SGEMMKERNEL = sgemm_kernel_loongson3a.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL = gemm_kernel_loongson3a.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

+ 2390
- 0
kernel/mips64/gemm_kernel_loongson3a.S
File diff suppressed because it is too large
View File


+ 2579
- 0
kernel/mips64/sgemm_kernel_loongson3a.S
File diff suppressed because it is too large
View File


+ 1938
- 0
kernel/mips64/trsm_kernel_LN_loongson3a.S
File diff suppressed because it is too large
View File


+ 1783
- 0
kernel/mips64/trsm_kernel_LT_loongson3a.S
File diff suppressed because it is too large
View File


+ 1852
- 0
kernel/mips64/trsm_kernel_RN_loongson3a.S
File diff suppressed because it is too large
View File


+ 1958
- 0
kernel/mips64/trsm_kernel_RT_loongson3a.S
File diff suppressed because it is too large
View File


+ 12
- 10
param.h View File

@@ -1480,27 +1480,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL #define GEMM_DEFAULT_ALIGN 0x03fffUL


#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4

#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 1 #define CGEMM_DEFAULT_UNROLL_M 1
#define CGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4


#define SGEMM_DEFAULT_P 108
#define DGEMM_DEFAULT_P 112
#define SGEMM_DEFAULT_P 32
#define DGEMM_DEFAULT_P 32
#define CGEMM_DEFAULT_P 108 #define CGEMM_DEFAULT_P 108
#define ZGEMM_DEFAULT_P 112 #define ZGEMM_DEFAULT_P 112


#define SGEMM_DEFAULT_Q 288
#define DGEMM_DEFAULT_Q 144
#define SGEMM_DEFAULT_Q 116
#define DGEMM_DEFAULT_Q 116
#define CGEMM_DEFAULT_Q 144 #define CGEMM_DEFAULT_Q 144
#define ZGEMM_DEFAULT_Q 72 #define ZGEMM_DEFAULT_Q 72


#define SGEMM_DEFAULT_R 2000
#define DGEMM_DEFAULT_R 2000
#define SGEMM_DEFAULT_R 1000
#define DGEMM_DEFAULT_R 1000
#define CGEMM_DEFAULT_R 2000 #define CGEMM_DEFAULT_R 2000
#define ZGEMM_DEFAULT_R 2000 #define ZGEMM_DEFAULT_R 2000




Loading…
Cancel
Save