@@ -79,6 +79,11 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||||
static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | ||||
int ncpu = num_cpu_avail(3); | int ncpu = num_cpu_avail(3); | ||||
#if defined(_WIN64) && defined(_M_ARM64) | |||||
if (MN > 100000000L) | |||||
return num_cpu_avail(4); | |||||
return 1; | |||||
#endif | |||||
#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | ||||
return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | ||||
#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | ||||
@@ -117,13 +117,15 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, | |||||
#if defined(_WIN64) && defined(_M_ARM64) | #if defined(_WIN64) && defined(_M_ARM64) | ||||
#ifdef COMPLEX | #ifdef COMPLEX | ||||
if (args.m * args.n > 600) | |||||
if (args.m * args.n <= 300) | |||||
#else | #else | ||||
if (args.m * args.n > 1000) | |||||
if (args.m * args.n <= 500) | |||||
#endif | #endif | ||||
args.nthreads = num_cpu_avail(4); | |||||
else | |||||
args.nthreads = 1; | args.nthreads = 1; | ||||
else if (args.m * args.n <= 1000) | |||||
args.nthreads = 4; | |||||
else | |||||
args.nthreads = num_cpu_avail(4); | |||||
#else | #else | ||||
#ifndef DOUBLE | #ifndef DOUBLE | ||||
if (args.m * args.n < 40000) | if (args.m * args.n < 40000) | ||||
@@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order, | |||||
#ifdef SMP | #ifdef SMP | ||||
if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) | |||||
#if defined(_WIN64) && defined(_M_ARM64) | |||||
if (m*n > 25000000L) | |||||
nthreads = num_cpu_avail(4); | |||||
else | |||||
nthreads = 1; | |||||
#else | |||||
if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD) | |||||
nthreads = 1; | nthreads = 1; | ||||
else | else | ||||
nthreads = num_cpu_avail(2); | nthreads = num_cpu_avail(2); | ||||
#endif | |||||
if (nthreads == 1) { | if (nthreads == 1) { | ||||
#endif | |||||
#endif | |||||
(gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); | (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); | ||||
#ifdef SMP | #ifdef SMP | ||||
} else { | } else { | ||||
(gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); | (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); | ||||
} | } | ||||
#endif | #endif | ||||
STACK_FREE(buffer); | STACK_FREE(buffer); | ||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); | ||||