Browse Source

Modify the aligned addresses of sa and sb to improve multi-threaded performance.

tags/v0.1.0^2
traz 14 years ago
parent
commit
831858b883
3 changed files with 13 additions and 24 deletions
  1. +3
    -3
      driver/level3/gemm_thread_n.c
  2. +2
    -2
      driver/others/parameter.c
  3. +8
    -19
      param.h

+ 3
- 3
driver/level3/gemm_thread_n.c View File

@@ -71,15 +71,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg; queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m; queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].range_n = &range[num_cpu];
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; //NULL;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;//NULL;
queue[num_cpu].next = &queue[num_cpu + 1]; queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++; num_cpu ++;
} }
if (num_cpu) { if (num_cpu) {
queue[0].sa = sa; queue[0].sa = sa;
queue[0].sb = sb;
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;


queue[num_cpu - 1].next = NULL; queue[num_cpu - 1].next = NULL;


+ 2
- 2
driver/others/parameter.c View File

@@ -688,11 +688,11 @@ void blas_set_parameter(void){
if(blas_num_threads == 1){ if(blas_num_threads == 1){
#endif #endif
//single thread //single thread
dgemm_r = 1000;
dgemm_r = 1024;
#ifdef SMP #ifdef SMP
}else{ }else{
//multi thread //multi thread
dgemm_r = 300;
dgemm_r = 200;
} }
#endif #endif
#endif #endif


+ 8
- 19
param.h View File

@@ -1493,33 +1493,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2


#define SGEMM_DEFAULT_P 64 #define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 32
#define DGEMM_DEFAULT_P 44
#define CGEMM_DEFAULT_P 64 #define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_P 32 #define ZGEMM_DEFAULT_P 32


#define SGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 112
#define CGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 92
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 80 #define ZGEMM_DEFAULT_Q 80


#define SGEMM_DEFAULT_R 1024 #define SGEMM_DEFAULT_R 1024
//#define DGEMM_DEFAULT_R 300
//#define DGEMM_DEFAULT_R 200
//#define DGEMM_DEFAULT_R 400
//#define DGEMM_DEFAULT_R 192
#define DGEMM_DEFAULT_R dgemm_r
//1000
//#define DGEMM_DEFAULT_R 160
//#define DGEMM_DEFAULT_R 270
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 1024 #define CGEMM_DEFAULT_R 1024
//#define ZGEMM_DEFAULT_R 1000
#define ZGEMM_DEFAULT_R 1000

#define GEMM_OFFSET_A1 (DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*SIZE)
//#define GEMM_OFFSET_B1 0x10
#define GEMM_OFFSET_B1 (DGEMM_DEFAULT_Q*DGEMM_DEFAULT_R*SIZE)
#define GEMM_OFFSET 0x100000
#define GEMM_OFFSET1 0x40000
#define ZGEMM_DEFAULT_R 1024

#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000


#define SYMV_P 16 #define SYMV_P 16
#endif #endif


Loading…
Cancel
Save