You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dot.c 5.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. /***************************************************************************
  2. Copyright (c) 2017, The OpenBLAS Project
  3. Copyright (c) 2022, Arm Ltd
  4. All rights reserved.
  5. Redistribution and use in source and binary forms, with or without
  6. modification, are permitted provided that the following conditions are
  7. met:
  8. 1. Redistributions of source code must retain the above copyright
  9. notice, this list of conditions and the following disclaimer.
  10. 2. Redistributions in binary form must reproduce the above copyright
  11. notice, this list of conditions and the following disclaimer in
  12. the documentation and/or other materials provided with the
  13. distribution.
  14. 3. Neither the name of the OpenBLAS project nor the names of
  15. its contributors may be used to endorse or promote products
  16. derived from this software without specific prior written permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. *****************************************************************************/
  28. #include "common.h"
  29. // Some compilers will report feature support for SVE without the appropriate
  30. // header available
  31. #ifdef HAVE_SVE
  32. #if defined __has_include
  33. #if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE
  34. #define USE_SVE
  35. #endif
  36. #endif
  37. #endif
  38. #ifdef USE_SVE
  39. #ifdef DOT_KERNEL_SVE
  40. #include DOT_KERNEL_SVE
  41. #else
  42. #include "dot_kernel_sve.c"
  43. #endif
  44. #endif
  45. #include "dot_kernel_asimd.c"
  46. #if defined(SMP)
  47. extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
  48. BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
  49. void *c, BLASLONG ldc, int (*function)(), int nthreads);
  50. #ifdef DYNAMIC_ARCH
  51. extern char* gotoblas_corename(void);
  52. #endif
  53. #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1)
  54. static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) {
  55. #ifdef DOUBLE
  56. return (N <= 10000L) ? 1
  57. : (N <= 64500L) ? 1
  58. : (N <= 100000L) ? MIN(ncpu, 2)
  59. : (N <= 150000L) ? MIN(ncpu, 4)
  60. : (N <= 260000L) ? MIN(ncpu, 8)
  61. : (N <= 360000L) ? MIN(ncpu, 16)
  62. : (N <= 520000L) ? MIN(ncpu, 24)
  63. : (N <= 1010000L) ? MIN(ncpu, 56)
  64. : ncpu;
  65. #else
  66. return (N <= 10000L) ? 1
  67. : (N <= 110000L) ? 1
  68. : (N <= 200000L) ? MIN(ncpu, 2)
  69. : (N <= 280000L) ? MIN(ncpu, 4)
  70. : (N <= 520000L) ? MIN(ncpu, 8)
  71. : (N <= 830000L) ? MIN(ncpu, 16)
  72. : (N <= 1010000L) ? MIN(ncpu, 24)
  73. : ncpu;
  74. #endif
  75. }
  76. #endif
  77. static inline int get_dot_optimal_nthreads(BLASLONG n) {
  78. int ncpu = num_cpu_avail(1);
  79. #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16)
  80. return get_dot_optimal_nthreads_neoversev1(n, ncpu);
  81. #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16)
  82. if (strcmp(gotoblas_corename(), "neoversev1") == 0) {
  83. return get_dot_optimal_nthreads_neoversev1(n, ncpu);
  84. }
  85. #endif
  86. // Default case
  87. if (n <= 10000L)
  88. return 1;
  89. else
  90. return num_cpu_avail(1);
  91. }
  92. #endif
  93. static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
  94. {
  95. RETURN_TYPE dot = 0.0 ;
  96. if ( n <= 0 ) return dot;
  97. #ifdef USE_SVE
  98. if (inc_x == 1 && inc_y == 1) {
  99. return dot_kernel_sve(n, x, y);
  100. }
  101. #endif
  102. return dot_kernel_asimd(n, x, inc_x, y, inc_y);
  103. }
  104. #if defined(SMP)
  105. static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
  106. BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
  107. BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
  108. {
  109. *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);
  110. return 0;
  111. }
  112. #endif
  113. RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
  114. {
  115. #if defined(SMP)
  116. int nthreads;
  117. FLOAT dummy_alpha;
  118. #endif
  119. RETURN_TYPE dot = 0.0;
  120. #if defined(SMP)
  121. if (inc_x == 0 || inc_y == 0)
  122. nthreads = 1;
  123. else
  124. nthreads = get_dot_optimal_nthreads(n);
  125. if (nthreads == 1) {
  126. dot = dot_compute(n, x, inc_x, y, inc_y);
  127. } else {
  128. int mode, i;
  129. char result[MAX_CPU_NUMBER * sizeof(double) * 2];
  130. RETURN_TYPE *ptr;
  131. #if !defined(DOUBLE)
  132. mode = BLAS_SINGLE | BLAS_REAL;
  133. #else
  134. mode = BLAS_DOUBLE | BLAS_REAL;
  135. #endif
  136. blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
  137. x, inc_x, y, inc_y, result, 0,
  138. (void *)dot_thread_function, nthreads);
  139. ptr = (RETURN_TYPE *)result;
  140. for (i = 0; i < nthreads; i++) {
  141. dot = dot + (*ptr);
  142. ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
  143. }
  144. }
  145. #else
  146. dot = dot_compute(n, x, inc_x, y, inc_y);
  147. #endif
  148. return dot;
  149. }