You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zsum.c 2.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. #include "common.h"
  2. #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
  3. #include "zsum_microk_skylakex-2.c"
  4. #endif
  5. #ifndef HAVE_ZASUM_KERNEL
  6. static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
  7. {
  8. BLASLONG i=0;
  9. BLASLONG n_8 = n & -8;
  10. FLOAT *x1 = x;
  11. FLOAT temp0, temp1, temp2, temp3;
  12. FLOAT temp4, temp5, temp6, temp7;
  13. FLOAT sum0 = 0.0;
  14. FLOAT sum1 = 0.0;
  15. FLOAT sum2 = 0.0;
  16. FLOAT sum3 = 0.0;
  17. FLOAT sum4 = 0.0;
  18. while (i < n_8) {
  19. sum0 += x1[0];
  20. sum1 += x1[1];
  21. sum2 += x1[2];
  22. sum3 += x1[3];
  23. sum0 += x1[4];
  24. sum1 += x1[5];
  25. sum2 += x1[6];
  26. sum3 += x1[7];
  27. x1+=8;
  28. i+=4;
  29. }
  30. while (i < n) {
  31. sum4 += x1[0] + x1[1];
  32. x1 += 2;
  33. i++;
  34. }
  35. return sum0+sum1+sum2+sum3+sum4;
  36. }
  37. #endif
  38. static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
  39. {
  40. BLASLONG i = 0;
  41. BLASLONG ip = 0;
  42. BLASLONG inc_x2;
  43. FLOAT sumf = 0.0;
  44. if (n <= 0 || inc_x <= 0) return(sumf);
  45. if (inc_x == 1) {
  46. sumf = zasum_kernel(n, x);
  47. }
  48. else {
  49. inc_x2 = 2 * inc_x;
  50. while (i < n) {
  51. sumf += x[ip] + x[ip + 1];
  52. ip += inc_x2;
  53. i++;
  54. }
  55. }
  56. return(sumf);
  57. }
  58. #if defined(SMP)
  59. static int sum_thread_function(BLASLONG n,
  60. BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
  61. FLOAT *x, BLASLONG inc_x,
  62. FLOAT * dummy3, BLASLONG dummy4,
  63. FLOAT * result, BLASLONG dummy5)
  64. {
  65. *(FLOAT *) result = sum_compute(n, x, inc_x);
  66. return 0;
  67. }
  68. extern int blas_level1_thread_with_return_value(int mode,
  69. BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
  70. void *a, BLASLONG lda,
  71. void *b, BLASLONG ldb,
  72. void *c, BLASLONG ldc,
  73. int (*function)(),
  74. int nthread);
  75. #endif
  76. FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
  77. {
  78. #if defined(SMP)
  79. int nthreads;
  80. FLOAT dummy_alpha[2];
  81. #endif
  82. FLOAT sumf = 0.0;
  83. #if defined(SMP)
  84. int num_cpu = num_cpu_avail(1);
  85. if (n <= 10000 || inc_x <= 0)
  86. nthreads = 1;
  87. else
  88. nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
  89. if (nthreads == 1) {
  90. sumf = sum_compute(n, x, inc_x);
  91. }
  92. else {
  93. int mode, i;
  94. char result[MAX_CPU_NUMBER * sizeof(double) *2];
  95. FLOAT *ptr;
  96. #if !defined(DOUBLE)
  97. mode = BLAS_SINGLE | BLAS_COMPLEX;
  98. #else
  99. mode = BLAS_DOUBLE | BLAS_COMPLEX;
  100. #endif
  101. blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
  102. NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
  103. ptr = (FLOAT *)result;
  104. for (i = 0; i < nthreads; i++) {
  105. sumf += (*ptr);
  106. ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
  107. }
  108. }
  109. #else
  110. sumf = sum_compute(n, x, inc_x);
  111. #endif
  112. return(sumf);
  113. }