You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

smallscaling.c 5.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. // run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
  2. #include <math.h>
  3. #include <stdlib.h>
  4. #include <stdio.h>
  5. #include <time.h>
  6. #include <cblas.h>
  7. #include <omp.h>
  8. #include <pthread.h>
  /*
   * Matrix orders swept by main(): the sweep starts at MIN_SIZE and grows
   * geometrically so that NB_SIZE steps lead from MIN_SIZE to MAX_SIZE.
   */
  #define MIN_SIZE 5
  #define MAX_SIZE 60
  #define NB_SIZE 10
  /*
   * Number of loops for a 1x1 matrix; each benchmark divides it by the
   * matrix order. Lower it if the test is too slow on your computer.
   * Kept as an integer constant because it is stored in the int field
   * BenchParam.n_loop.
   */
  #define NLOOP 20000000
  /*
   * Everything one benchmark run needs.
   *
   *   matrix_size    order of the (square) matrix being benchmarked
   *   n_loop         loop budget; the drivers divide it by matrix_size
   *   bench_func     driver that allocates operands and loops over blas_func
   *   blas_func      BLAS routine under test; deliberately left without a
   *                  prototype because each driver calls it with a different
   *                  argument list
   *   create_matrix  allocates and fills an operand of `size` elements;
   *                  the caller releases it with free()
   */
  typedef struct BenchParam BenchParam;
  struct BenchParam {
      int matrix_size;
      int n_loop;
      /* Prototyped so that calls are type-checked by the compiler. */
      void (*bench_func)(BenchParam *param);
      void (*blas_func)();
      void *(*create_matrix)(int size);
  };
  /*
   * Allocate a buffer of `size` floats and fill it with r[i] = 1000 * i / size.
   *
   * The caller owns the returned buffer and releases it with free().
   * A non-positive `size` yields an empty allocation (possibly NULL).
   * Running out of memory terminates the program with an error message
   * instead of writing through a null pointer.
   */
  void * s_create_matrix(int size) {
      size_t count = size > 0 ? (size_t)size : 0;
      /* sizeof *r keeps the allocation tied to the element type (float). */
      float *r = malloc(count * sizeof *r);
      size_t i;

      if (r == NULL) {
          if (count == 0)
              return NULL;
          fprintf(stderr, "s_create_matrix: out of memory (%zu floats)\n", count);
          exit(EXIT_FAILURE);
      }
      for (i = 0; i < count; i++)
          r[i] = 1e3 * i / size;
      return r;
  }
  /*
   * Allocate a buffer of 2 * `size` floats (two floats per element) and
   * fill it with r[i] = 1000 * i / size for i in [0, 2 * size).
   *
   * The caller owns the returned buffer and releases it with free().
   * A non-positive `size` yields an empty allocation (possibly NULL).
   * Running out of memory terminates the program with an error message
   * instead of writing through a null pointer.
   */
  void * c_create_matrix(int size) {
      size_t count = size > 0 ? 2 * (size_t)size : 0;
      /* sizeof *r keeps the allocation tied to the element type (float). */
      float *r = malloc(count * sizeof *r);
      size_t i;

      if (r == NULL) {
          if (count == 0)
              return NULL;
          fprintf(stderr, "c_create_matrix: out of memory (%zu floats)\n", count);
          exit(EXIT_FAILURE);
      }
      for (i = 0; i < count; i++)
          r[i] = 1e3 * i / size;
      return r;
  }
  /*
   * Allocate a buffer of 2 * `size` doubles (two doubles per element) and
   * fill it with r[i] = 1000 * i / size for i in [0, 2 * size).
   *
   * The caller owns the returned buffer and releases it with free().
   * A non-positive `size` yields an empty allocation (possibly NULL).
   * Running out of memory terminates the program with an error message
   * instead of writing through a null pointer.
   */
  void * z_create_matrix(int size) {
      /* Element count is computed in size_t so 2 * size cannot overflow int. */
      size_t count = size > 0 ? 2 * (size_t)size : 0;
      double *r = malloc(count * sizeof *r);
      size_t i;

      if (r == NULL) {
          if (count == 0)
              return NULL;
          fprintf(stderr, "z_create_matrix: out of memory (%zu doubles)\n", count);
          exit(EXIT_FAILURE);
      }
      for (i = 0; i < count; i++)
          r[i] = 1e3 * i / size;
      return r;
  }
  /*
   * Allocate a buffer of `size` doubles and fill it with
   * r[i] = 1000 * i / size.
   *
   * The caller owns the returned buffer and releases it with free().
   * A non-positive `size` yields an empty allocation (possibly NULL).
   * Running out of memory terminates the program with an error message
   * instead of writing through a null pointer.
   */
  void * d_create_matrix(int size) {
      size_t count = size > 0 ? (size_t)size : 0;
      double *r = malloc(count * sizeof *r);
      size_t i;

      if (r == NULL) {
          if (count == 0)
              return NULL;
          fprintf(stderr, "d_create_matrix: out of memory (%zu doubles)\n", count);
          exit(EXIT_FAILURE);
      }
      for (i = 0; i < count; i++)
          r[i] = 1e3 * i / size;
      return r;
  }
  50. void trmv_bench(BenchParam * param)
  51. {
  52. int i, n;
  53. int size = param->matrix_size;
  54. n = param->n_loop / size;
  55. int one = 1;
  56. void * A = param->create_matrix(size * size);
  57. void * y = param->create_matrix(size);
  58. for(i = 0; i < n; i++) {
  59. param->blas_func("U", "N", "N", &size, A, &size, y, &one);
  60. }
  61. free(A);
  62. free(y);
  63. }
  64. void gemv_bench(BenchParam * param)
  65. {
  66. int i, n;
  67. int size = param->matrix_size;
  68. n = param->n_loop / size;
  69. double v = 1.01;
  70. int one = 1;
  71. void * A = param->create_matrix(size * size);
  72. void * y = param->create_matrix(size);
  73. for(i = 0; i < n; i++) {
  74. param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
  75. }
  76. free(A);
  77. free(y);
  78. }
  79. void ger_bench(BenchParam * param) {
  80. int i, n;
  81. int size = param->matrix_size;
  82. n = param->n_loop / size;
  83. double v = 1.01;
  84. int one = 1;
  85. void * A = param->create_matrix(size * size);
  86. void * y = param->create_matrix(size);
  87. for(i = 0; i < n; i++) {
  88. param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
  89. }
  90. free(A);
  91. free(y);
  92. }
  93. #ifndef _WIN32
  94. void * pthread_func_wrapper(void * param) {
  95. ((BenchParam *)param)->bench_func(param);
  96. pthread_exit(NULL);
  97. }
  98. #endif
  99. #define NB_TESTS 5
  100. void * TESTS[4 * NB_TESTS] = {
  101. trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
  102. gemv_bench, dgemv_, d_create_matrix, "dgemv",
  103. gemv_bench, zgemv_, z_create_matrix, "zgemv",
  104. ger_bench, dger_, d_create_matrix, "dger",
  105. ger_bench, zgerc_, z_create_matrix, "zgerc",
  106. };
  /*
   * Seconds elapsed on CLOCK_MONOTONIC since `tick`, as a double.
   * `tick` is a reading previously taken from the same clock.
   */
  static inline double delta_time(struct timespec tick) {
      struct timespec now;
      time_t whole_seconds;
      long nanoseconds;

      clock_gettime(CLOCK_MONOTONIC, &now);
      whole_seconds = now.tv_sec - tick.tv_sec;
      nanoseconds = now.tv_nsec - tick.tv_nsec;
      return whole_seconds + nanoseconds / 1e9;
  }
  112. double pthread_bench(BenchParam * param, int nb_threads)
  113. {
  114. #ifdef _WIN32
  115. return 0;
  116. #else
  117. BenchParam threaded_param = *param;
  118. pthread_t threads[nb_threads];
  119. int t, rc;
  120. struct timespec tick;
  121. threaded_param.n_loop /= nb_threads;
  122. clock_gettime(CLOCK_MONOTONIC, &tick);
  123. for(t=0; t<nb_threads; t++){
  124. rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
  125. if (rc){
  126. printf("ERROR; return code from pthread_create() is %d\n", rc);
  127. exit(-1);
  128. }
  129. }
  130. for(t=0; t<nb_threads; t++){
  131. pthread_join(threads[t], NULL);
  132. }
  133. return delta_time(tick);
  134. #endif
  135. }
  136. double seq_bench(BenchParam * param) {
  137. struct timespec tick;
  138. clock_gettime(CLOCK_MONOTONIC, &tick);
  139. param->bench_func(param);
  140. return delta_time(tick);
  141. }
  142. double omp_bench(BenchParam * param) {
  143. BenchParam threaded_param = *param;
  144. struct timespec tick;
  145. int t;
  146. int nb_threads = omp_get_max_threads();
  147. threaded_param.n_loop /= nb_threads;
  148. clock_gettime(CLOCK_MONOTONIC, &tick);
  149. #pragma omp parallel for
  150. for(t = 0; t < nb_threads; t ++){
  151. param->bench_func(&threaded_param);
  152. }
  153. return delta_time(tick);
  154. }
  155. int main(int argc, char * argv[]) {
  156. double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
  157. BenchParam param;
  158. int test_id;
  159. printf ("Running on %d threads\n", omp_get_max_threads());
  160. for(test_id = 0; test_id < NB_TESTS; test_id ++) {
  161. double size = MIN_SIZE;
  162. param.bench_func = TESTS[test_id * 4];
  163. param.blas_func = TESTS[test_id * 4 + 1];
  164. param.create_matrix = TESTS[test_id * 4 + 2];
  165. printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
  166. param.n_loop = NLOOP;
  167. while(size <= MAX_SIZE) {
  168. param.matrix_size = (int)(size + 0.5);
  169. double seq_time = seq_bench(&param);
  170. double omp_time = omp_bench(&param);
  171. double pthread_time = pthread_bench(&param, omp_get_max_threads());
  172. printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
  173. "pthread %gs, speedup %g\n",
  174. param.matrix_size, seq_time,
  175. omp_time, seq_time / omp_time,
  176. pthread_time, seq_time / pthread_time);
  177. size *= inc_factor;
  178. }
  179. }
  180. return(0);
  181. }