You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

blas_server.c 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. #include "common.h"
  66. #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
  67. #include <dlfcn.h>
  68. #include <errno.h>
  69. #include <signal.h>
  70. #include <sys/resource.h>
  71. #include <sys/time.h>
  72. #endif
  /*
   * Branch-prediction hints.  With a GNU-compatible compiler they map onto
   * __builtin_expect(); otherwise they simply pass the expression through.
   * Either macro is left alone if it has already been defined.
   */
  #if !defined(likely)
  #  if defined(__GNUC__)
  #    define likely(x) __builtin_expect(!!(x), 1)
  #  else
  #    define likely(x) (x)
  #  endif
  #endif

  #if !defined(unlikely)
  #  if defined(__GNUC__)
  #    define unlikely(x) __builtin_expect(!!(x), 0)
  #  else
  #    define unlikely(x) (x)
  #  endif
  #endif
  87. extern unsigned int openblas_thread_timeout(void);
  88. #ifdef SMP_SERVER
  89. #undef MONITOR
  90. #undef TIMING
  91. #undef TIMING_DEBUG
  92. #undef NEED_STACKATTR
  93. #define ATTRIBUTE_SIZE 128
  94. /* This is a thread server model implementation. The threads are */
  95. /* spawned at first access to blas library, and still remains until */
  96. /* destruction routine is called. The number of threads are */
  97. /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
  98. /* jobs is queued. */
  99. /* We need this global for checking if initialization is finished. */
  100. int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
  101. int blas_omp_threads_local = 1;
  102. static void * blas_thread_buffer[MAX_CPU_NUMBER];
  103. /* Local Variables */
  104. #if defined(USE_PTHREAD_LOCK)
  105. static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
  106. #elif defined(USE_PTHREAD_SPINLOCK)
  107. static pthread_spinlock_t server_lock = 0;
  108. #else
  109. static unsigned long server_lock = 0;
  110. #endif
  111. #define THREAD_STATUS_SLEEP 2
  112. #define THREAD_STATUS_WAKEUP 4
  113. static pthread_t blas_threads [MAX_CPU_NUMBER];
  114. typedef struct {
  115. blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE)));
  116. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  117. int node;
  118. #endif
  119. volatile long status;
  120. pthread_mutex_t lock;
  121. pthread_cond_t wakeup;
  122. } thread_status_t;
  123. #ifdef HAVE_C11
  124. #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
  125. #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
  126. #else
  127. #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p))
  128. #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v))
  129. #endif
  130. static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
  131. #ifndef THREAD_TIMEOUT
  132. #define THREAD_TIMEOUT 28
  133. #endif
  134. static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
  135. #ifdef MONITOR
  136. /* Monitor is a function to see thread's status for every second. */
  137. /* Usually it turns off and it's for debugging. */
  138. static pthread_t monitor_thread;
  139. static int main_status[MAX_CPU_NUMBER];
  140. #define MAIN_ENTER 0x01
  141. #define MAIN_EXIT 0x02
  142. #define MAIN_TRYLOCK 0x03
  143. #define MAIN_LOCKSUCCESS 0x04
  144. #define MAIN_QUEUING 0x05
  145. #define MAIN_RECEIVING 0x06
  146. #define MAIN_RUNNING1 0x07
  147. #define MAIN_RUNNING2 0x08
  148. #define MAIN_RUNNING3 0x09
  149. #define MAIN_WAITING 0x0a
  150. #define MAIN_SLEEPING 0x0b
  151. #define MAIN_FINISH 0x0c
  152. #define MAIN_DONE 0x0d
  153. #endif
  154. #define BLAS_QUEUE_FINISHED 3
  155. #define BLAS_QUEUE_RUNNING 4
  156. #ifdef TIMING
  157. BLASLONG exit_time[MAX_CPU_NUMBER];
  158. #endif
  159. //Prototypes
  160. static void exec_threads(int , blas_queue_t *, int);
  161. static void adjust_thread_buffers();
  162. static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
  163. if (!(mode & BLAS_COMPLEX)){
  164. #ifdef EXPRECISION
  165. if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
  166. /* REAL / Extended Double */
  167. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
  168. xdouble *, BLASLONG, xdouble *, BLASLONG,
  169. xdouble *, BLASLONG, void *) = func;
  170. afunc(args -> m, args -> n, args -> k,
  171. ((xdouble *)args -> alpha)[0],
  172. args -> a, args -> lda,
  173. args -> b, args -> ldb,
  174. args -> c, args -> ldc, sb);
  175. } else
  176. #endif
  177. if ((mode & BLAS_PREC) == BLAS_DOUBLE){
  178. /* REAL / Double */
  179. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
  180. double *, BLASLONG, double *, BLASLONG,
  181. double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG,
  182. double *, BLASLONG, double *, BLASLONG, void *)) func;
  183. afunc(args -> m, args -> n, args -> k,
  184. ((double *)args -> alpha)[0],
  185. args -> a, args -> lda,
  186. args -> b, args -> ldb,
  187. args -> c, args -> ldc, sb);
  188. } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
  189. /* REAL / Single */
  190. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
  191. float *, BLASLONG, float *, BLASLONG,
  192. float *, BLASLONG, void *) = (void (*)
  193. (BLASLONG, BLASLONG, BLASLONG, float,
  194. float *, BLASLONG, float *, BLASLONG,
  195. float *, BLASLONG, void *)) func;
  196. afunc(args -> m, args -> n, args -> k,
  197. ((float *)args -> alpha)[0],
  198. args -> a, args -> lda,
  199. args -> b, args -> ldb,
  200. args -> c, args -> ldc, sb);
  201. #ifdef BUILD_BFLOAT16
  202. } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
  203. /* REAL / BFLOAT16 */
  204. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
  205. bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
  206. bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
  207. bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
  208. bfloat16 *, BLASLONG, void *)) func;
  209. afunc(args -> m, args -> n, args -> k,
  210. ((bfloat16 *)args -> alpha)[0],
  211. args -> a, args -> lda,
  212. args -> b, args -> ldb,
  213. args -> c, args -> ldc, sb);
  214. } else if ((mode & BLAS_PREC) == BLAS_STOBF16){
  215. /* REAL / BLAS_STOBF16 */
  216. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
  217. float *, BLASLONG, bfloat16 *, BLASLONG,
  218. float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float,
  219. float *, BLASLONG, bfloat16 *, BLASLONG,
  220. float *, BLASLONG, void *)) func;
  221. afunc(args -> m, args -> n, args -> k,
  222. ((float *)args -> alpha)[0],
  223. args -> a, args -> lda,
  224. args -> b, args -> ldb,
  225. args -> c, args -> ldc, sb);
  226. } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
  227. /* REAL / BLAS_DTOBF16 */
  228. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
  229. double *, BLASLONG, bfloat16 *, BLASLONG,
  230. double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double,
  231. double *, BLASLONG, bfloat16 *, BLASLONG,
  232. double *, BLASLONG, void *)) func;
  233. afunc(args -> m, args -> n, args -> k,
  234. ((double *)args -> alpha)[0],
  235. args -> a, args -> lda,
  236. args -> b, args -> ldb,
  237. args -> c, args -> ldc, sb);
  238. #endif
  239. } else {
  240. /* REAL / Other types in future */
  241. }
  242. } else {
  243. #ifdef EXPRECISION
  244. if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
  245. /* COMPLEX / Extended Double */
  246. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
  247. xdouble *, BLASLONG, xdouble *, BLASLONG,
  248. xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
  249. xdouble *, BLASLONG, xdouble *, BLASLONG,
  250. xdouble *, BLASLONG, void *)) func;
  251. afunc(args -> m, args -> n, args -> k,
  252. ((xdouble *)args -> alpha)[0],
  253. ((xdouble *)args -> alpha)[1],
  254. args -> a, args -> lda,
  255. args -> b, args -> ldb,
  256. args -> c, args -> ldc, sb);
  257. } else
  258. #endif
  259. if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
  260. /* COMPLEX / Double */
  261. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
  262. double *, BLASLONG, double *, BLASLONG,
  263. double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double,
  264. double *, BLASLONG, double *, BLASLONG,
  265. double *, BLASLONG, void *)) func;
  266. afunc(args -> m, args -> n, args -> k,
  267. ((double *)args -> alpha)[0],
  268. ((double *)args -> alpha)[1],
  269. args -> a, args -> lda,
  270. args -> b, args -> ldb,
  271. args -> c, args -> ldc, sb);
  272. } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
  273. /* COMPLEX / Single */
  274. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
  275. float *, BLASLONG, float *, BLASLONG,
  276. float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float,
  277. float *, BLASLONG, float *, BLASLONG,
  278. float *, BLASLONG, void *)) func;
  279. afunc(args -> m, args -> n, args -> k,
  280. ((float *)args -> alpha)[0],
  281. ((float *)args -> alpha)[1],
  282. args -> a, args -> lda,
  283. args -> b, args -> ldb,
  284. args -> c, args -> ldc, sb);
  285. } else {
  286. /* COMPLEX / Other types in future */
  287. }
  288. }
  289. }
  #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  /* Affinity helpers implemented outside this file. */
  int gotoblas_set_affinity(int);
  int gotoblas_set_affinity2(int);
  int get_node(void);
  #endif

  /* Set to 1 once goto_set_num_threads() has grown the worker pool. */
  static int increased_threads = 0;
  #ifdef OS_LINUX
  extern int openblas_get_num_threads(void);

  /*
   * Translate a thread index into a pthread handle.
   *
   * The last valid index (active thread count - 1) stands for the calling
   * thread; every lower index selects the matching entry of blas_threads[].
   * Returns 0 on success.  An index outside [0, active thread count) sets
   * errno to EINVAL and returns -1.
   */
  static int affinity_resolve_thread(int thread_idx, pthread_t *handle) {
    const int active_threads = openblas_get_num_threads();

    if (thread_idx < 0 || thread_idx >= active_threads) {
      errno = EINVAL;
      return -1;
    }

    if (thread_idx == active_threads - 1) {
      *handle = pthread_self();
    } else {
      *handle = blas_threads[thread_idx];
    }
    return 0;
  }

  /*
   * Apply cpu_set (cpusetsize bytes) to the thread selected by thread_idx.
   * Returns -1 with errno = EINVAL for an invalid index, otherwise whatever
   * pthread_setaffinity_np() returns.
   */
  int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
    pthread_t thread;

    if (affinity_resolve_thread(thread_idx, &thread) != 0) {
      return -1;
    }
    return pthread_setaffinity_np(thread, cpusetsize, cpu_set);
  }

  /*
   * Read the affinity mask of the thread selected by thread_idx into cpu_set
   * (cpusetsize bytes).  Returns -1 with errno = EINVAL for an invalid index,
   * otherwise whatever pthread_getaffinity_np() returns.
   */
  int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
    pthread_t thread;

    if (affinity_resolve_thread(thread_idx, &thread) != 0) {
      return -1;
    }
    return pthread_getaffinity_np(thread, cpusetsize, cpu_set);
  }
  #endif
  321. static void* blas_thread_server(void *arg){
  322. /* Thread identifier */
  323. BLASLONG cpu = (BLASLONG)arg;
  324. unsigned int last_tick;
  325. blas_queue_t *queue;
  326. blas_queue_t *tscq;
  327. #ifdef TIMING_DEBUG
  328. unsigned long start, stop;
  329. #endif
  330. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  331. if (!increased_threads)
  332. thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
  333. else
  334. thread_status[cpu].node = gotoblas_set_affinity(-1);
  335. #endif
  336. #ifdef MONITOR
  337. main_status[cpu] = MAIN_ENTER;
  338. #endif
  339. #ifdef SMP_DEBUG
  340. fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
  341. #endif
  342. while (1){
  343. #ifdef MONITOR
  344. main_status[cpu] = MAIN_QUEUING;
  345. #endif
  346. #ifdef TIMING
  347. exit_time[cpu] = rpcc();
  348. #endif
  349. last_tick = (unsigned int)rpcc();
  350. tscq = atomic_load_queue(&thread_status[cpu].queue);
  351. while(!tscq) {
  352. YIELDING;
  353. if ((unsigned int)rpcc() - last_tick > thread_timeout) {
  354. if (!atomic_load_queue(&thread_status[cpu].queue)) {
  355. pthread_mutex_lock (&thread_status[cpu].lock);
  356. thread_status[cpu].status = THREAD_STATUS_SLEEP;
  357. while (thread_status[cpu].status == THREAD_STATUS_SLEEP &&
  358. !atomic_load_queue(&thread_status[cpu].queue)) {
  359. #ifdef MONITOR
  360. main_status[cpu] = MAIN_SLEEPING;
  361. #endif
  362. pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
  363. }
  364. pthread_mutex_unlock(&thread_status[cpu].lock);
  365. }
  366. last_tick = (unsigned int)rpcc();
  367. }
  368. tscq = atomic_load_queue(&thread_status[cpu].queue);
  369. }
  370. queue = atomic_load_queue(&thread_status[cpu].queue);
  371. MB;
  372. if ((long)queue == -1) break;
  373. #ifdef MONITOR
  374. main_status[cpu] = MAIN_RECEIVING;
  375. #endif
  376. #ifdef TIMING_DEBUG
  377. start = rpcc();
  378. #endif
  379. if(queue) {
  380. exec_threads(cpu, queue, 0);
  381. }
  382. #ifdef MONITOR
  383. main_status[cpu] = MAIN_DONE;
  384. #endif
  385. #ifdef TIMING_DEBUG
  386. stop = rpcc();
  387. fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
  388. start, stop,
  389. stop - start);
  390. #endif
  391. }
  392. /* Shutdown procedure */
  393. #ifdef SMP_DEBUG
  394. fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
  395. #endif
  396. //pthread_exit(NULL);
  397. return NULL;
  398. }
  #ifdef MONITOR

  /* Count of sleeping workers that exec_blas_async() had to wake up. */
  static BLASLONG num_suspend = 0;

  /*
   * Debug monitor: once per second, prints the recorded state of each worker
   * slot followed by the suspend counter.  The loop never terminates, so the
   * trailing return is not reached.
   */
  static int blas_monitor(void *arg){
    int slot;

    for (;;) {
      for (slot = 0; slot < blas_num_threads - 1; slot++) {
        switch (main_status[slot]) {
        case MAIN_ENTER :
          fprintf(STDERR, "THREAD[%2d] : Entering.\n", slot);
          break;
        case MAIN_EXIT :
          fprintf(STDERR, "THREAD[%2d] : Exiting.\n", slot);
          break;
        case MAIN_TRYLOCK :
          fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", slot);
          break;
        case MAIN_QUEUING :
          fprintf(STDERR, "THREAD[%2d] : Queuing.\n", slot);
          break;
        case MAIN_RECEIVING :
          fprintf(STDERR, "THREAD[%2d] : Receiving.\n", slot);
          break;
        case MAIN_RUNNING1 :
          fprintf(STDERR, "THREAD[%2d] : Running1.\n", slot);
          break;
        case MAIN_RUNNING2 :
          fprintf(STDERR, "THREAD[%2d] : Running2.\n", slot);
          break;
        case MAIN_RUNNING3 :
          fprintf(STDERR, "THREAD[%2d] : Running3.\n", slot);
          break;
        case MAIN_WAITING :
          fprintf(STDERR, "THREAD[%2d] : Waiting.\n", slot);
          break;
        case MAIN_SLEEPING :
          fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", slot);
          break;
        case MAIN_FINISH :
          fprintf(STDERR, "THREAD[%2d] : Finishing.\n", slot);
          break;
        case MAIN_DONE :
          fprintf(STDERR, "THREAD[%2d] : Job is done.\n", slot);
          break;
        }

        /* Printed once per slot, exactly as before. */
        fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend);
      }

      sleep(1);
    }

    return 0;
  }

  #endif
  450. /* Initializing routine */
  451. int blas_thread_init(void){
  452. BLASLONG i;
  453. int ret;
  454. int thread_timeout_env;
  455. #ifdef NEED_STACKATTR
  456. pthread_attr_t attr;
  457. #endif
  458. if (blas_server_avail) return 0;
  459. #ifdef NEED_STACKATTR
  460. pthread_attr_init(&attr);
  461. pthread_attr_setguardsize(&attr, 0x1000U);
  462. pthread_attr_setstacksize( &attr, 0x1000U);
  463. #endif
  464. LOCK_COMMAND(&server_lock);
  465. // Adjust thread buffers
  466. adjust_thread_buffers();
  467. if (!blas_server_avail){
  468. thread_timeout_env=openblas_thread_timeout();
  469. if (thread_timeout_env>0) {
  470. if (thread_timeout_env < 4) thread_timeout_env = 4;
  471. if (thread_timeout_env > 30) thread_timeout_env = 30;
  472. thread_timeout = (1 << thread_timeout_env);
  473. }
  474. for(i = 0; i < blas_num_threads - 1; i++){
  475. atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
  476. thread_status[i].status = THREAD_STATUS_WAKEUP;
  477. pthread_mutex_init(&thread_status[i].lock, NULL);
  478. pthread_cond_init (&thread_status[i].wakeup, NULL);
  479. #ifdef NEED_STACKATTR
  480. ret=pthread_create(&blas_threads[i], &attr,
  481. &blas_thread_server, (void *)i);
  482. #else
  483. ret=pthread_create(&blas_threads[i], NULL,
  484. &blas_thread_server, (void *)i);
  485. #endif
  486. if(ret!=0){
  487. struct rlimit rlim;
  488. const char *msg = strerror(ret);
  489. fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg);
  490. #ifdef RLIMIT_NPROC
  491. if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
  492. fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
  493. "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
  494. }
  495. #endif
  496. if(0 != raise(SIGINT)) {
  497. fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
  498. exit(EXIT_FAILURE);
  499. }
  500. }
  501. }
  502. #ifdef MONITOR
  503. pthread_create(&monitor_thread, NULL,
  504. (void *)&blas_monitor, (void *)NULL);
  505. #endif
  506. blas_server_avail = 1;
  507. }
  508. UNLOCK_COMMAND(&server_lock);
  509. return 0;
  510. }
  511. /*
  512. User can call one of two routines.
  513. exec_blas_async ... immediately returns after jobs are queued.
  514. exec_blas ... returns after jobs are finished.
  515. */
  516. static BLASULONG exec_queue_lock = 0;
  517. int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
  518. #ifdef SMP_SERVER
  519. // Handle lazy re-init of the thread-pool after a POSIX fork
  520. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  521. #endif
  522. BLASLONG i = 0;
  523. blas_queue_t *current = queue;
  524. blas_queue_t *tsiq,*tspq;
  525. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  526. int node = get_node();
  527. int nodes = get_num_nodes();
  528. #endif
  529. #ifdef SMP_DEBUG
  530. int exec_count = 0;
  531. fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos);
  532. #endif
  533. blas_lock(&exec_queue_lock);
  534. while (queue) {
  535. queue -> position = pos;
  536. #ifdef CONSISTENT_FPCSR
  537. #ifdef __aarch64__
  538. __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode));
  539. #else
  540. __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
  541. __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
  542. #endif
  543. #endif
  544. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  545. /* Node Mapping Mode */
  546. if (queue -> mode & BLAS_NODE) {
  547. do {
  548. while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++;
  549. if (i < blas_num_threads - 1) break;
  550. i ++;
  551. if (i >= blas_num_threads - 1) {
  552. i = 0;
  553. node ++;
  554. if (node >= nodes) node = 0;
  555. }
  556. } while (1);
  557. } else {
  558. tsiq = atomic_load_queue(&thread_status[i].queue);
  559. while(tsiq) {
  560. i ++;
  561. if (i >= blas_num_threads - 1) i = 0;
  562. tsiq = atomic_load_queue(&thread_status[i].queue);
  563. }
  564. }
  565. #else
  566. tsiq = atomic_load_queue(&thread_status[i].queue);
  567. while(tsiq) {
  568. i ++;
  569. if (i >= blas_num_threads - 1) i = 0;
  570. tsiq = atomic_load_queue(&thread_status[i].queue);
  571. }
  572. #endif
  573. queue -> assigned = i;
  574. MB;
  575. atomic_store_queue(&thread_status[i].queue, queue);
  576. queue = queue -> next;
  577. pos ++;
  578. #ifdef SMP_DEBUG
  579. exec_count ++;
  580. #endif
  581. }
  582. blas_unlock(&exec_queue_lock);
  583. #ifdef SMP_DEBUG
  584. fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count);
  585. #endif
  586. while (current) {
  587. pos = current -> assigned;
  588. tspq = atomic_load_queue(&thread_status[pos].queue);
  589. if ((BLASULONG)tspq > 1) {
  590. pthread_mutex_lock (&thread_status[pos].lock);
  591. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  592. #ifdef MONITOR
  593. num_suspend ++;
  594. #endif
  595. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  596. thread_status[pos].status = THREAD_STATUS_WAKEUP;
  597. pthread_cond_signal(&thread_status[pos].wakeup);
  598. }
  599. }
  600. pthread_mutex_unlock(&thread_status[pos].lock);
  601. }
  602. current = current -> next;
  603. }
  604. return 0;
  605. }
  606. int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
  607. blas_queue_t * tsqq;
  608. while ((num > 0) && queue) {
  609. tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
  610. while(tsqq) {
  611. YIELDING;
  612. tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
  613. };
  614. queue = queue -> next;
  615. num --;
  616. }
  617. MB;
  618. #ifdef SMP_DEBUG
  619. fprintf(STDERR, "Done.\n\n");
  620. #endif
  621. return 0;
  622. }
  623. /* Execute Threads */
  624. int exec_blas(BLASLONG num, blas_queue_t *queue){
  625. #ifdef SMP_SERVER
  626. // Handle lazy re-init of the thread-pool after a POSIX fork
  627. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  628. #endif
  629. int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
  630. #ifdef TIMING_DEBUG
  631. BLASULONG start, stop;
  632. #endif
  633. if ((num <= 0) || (queue == NULL)) return 0;
  634. #ifdef SMP_DEBUG
  635. fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
  636. #endif
  637. //Redirect to caller's callback routine
  638. if (openblas_threads_callback_) {
  639. int buf_index = 0, i = 0;
  640. #ifndef USE_SIMPLE_THREADED_LEVEL3
  641. for (i = 0; i < num; i ++)
  642. queue[i].position = i;
  643. #endif
  644. openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index);
  645. return 0;
  646. }
  647. #ifdef __ELF__
  648. if (omp_in_parallel && (num > 1)) {
  649. if (omp_in_parallel() > 0) {
  650. fprintf(stderr,
  651. "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
  652. "Please rebuild the library with USE_OPENMP=1 option.\n");
  653. }
  654. }
  655. #endif
  656. if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
  657. #ifdef TIMING_DEBUG
  658. start = rpcc();
  659. fprintf(STDERR, "\n");
  660. #endif
  661. routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine;
  662. if (queue -> mode & BLAS_LEGACY) {
  663. legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
  664. } else
  665. if (queue -> mode & BLAS_PTHREAD) {
  666. void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine;
  667. (pthreadcompat)(queue -> args);
  668. } else
  669. (routine)(queue -> args, queue -> range_m, queue -> range_n,
  670. queue -> sa, queue -> sb, 0);
  671. #ifdef TIMING_DEBUG
  672. stop = rpcc();
  673. #endif
  674. if ((num > 1) && queue -> next) {
  675. exec_blas_async_wait(num - 1, queue -> next);
  676. // arm: make sure results from other threads are visible
  677. MB;
  678. }
  679. #ifdef TIMING_DEBUG
  680. fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
  681. start, stop,
  682. stop - start);
  683. #endif
  684. return 0;
  685. }
  686. void goto_set_num_threads(int num_threads) {
  687. long i;
  688. #ifdef SMP_SERVER
  689. // Handle lazy re-init of the thread-pool after a POSIX fork
  690. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  691. #endif
  692. if (num_threads < 1) num_threads = blas_num_threads;
  693. #ifndef NO_AFFINITY
  694. if (num_threads == 1) {
  695. if (blas_cpu_number == 1){
  696. //OpenBLAS is already single thread.
  697. return;
  698. }else{
  699. //From multi-threads to single thread
  700. //Restore the original affinity mask
  701. gotoblas_set_affinity(-1);
  702. }
  703. }
  704. #endif
  705. if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
  706. if (num_threads > blas_num_threads) {
  707. LOCK_COMMAND(&server_lock);
  708. increased_threads = 1;
  709. for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
  710. atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
  711. thread_status[i].status = THREAD_STATUS_WAKEUP;
  712. pthread_mutex_init(&thread_status[i].lock, NULL);
  713. pthread_cond_init (&thread_status[i].wakeup, NULL);
  714. #ifdef NEED_STACKATTR
  715. pthread_create(&blas_threads[i], &attr,
  716. &blas_thread_server, (void *)i);
  717. #else
  718. pthread_create(&blas_threads[i], NULL,
  719. &blas_thread_server, (void *)i);
  720. #endif
  721. }
  722. blas_num_threads = num_threads;
  723. UNLOCK_COMMAND(&server_lock);
  724. }
  725. #ifndef NO_AFFINITY
  726. if(blas_cpu_number == 1 && num_threads > 1){
  727. //Restore the thread 0 affinity.
  728. gotoblas_set_affinity(0);
  729. }
  730. #endif
  731. blas_cpu_number = num_threads;
  732. #if defined(ARCH_MIPS64)
  733. #ifndef DYNAMIC_ARCH
  734. //set parameters for different number of threads.
  735. blas_set_parameter();
  736. #endif
  737. #endif
  738. }
  /*
   * Public entry point for changing the thread count.
   * Forwards num_threads unchanged to goto_set_num_threads().
   */
  void openblas_set_num_threads(int num_threads)
  {
    goto_set_num_threads(num_threads);
  }
  742. /* Compatible function with pthread_create / join */
  743. int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
  744. blas_queue_t queue[MAX_CPU_NUMBER];
  745. int i;
  746. if (numthreads <= 0) return 0;
  747. #ifdef SMP
  748. if (blas_cpu_number == 0) blas_get_cpu_number();
  749. #ifdef SMP_SERVER
  750. if (blas_server_avail == 0) blas_thread_init();
  751. #endif
  752. #endif
  753. for (i = 0; i < numthreads; i ++) {
  754. queue[i].mode = BLAS_PTHREAD;
  755. queue[i].routine = function;
  756. queue[i].args = args;
  757. queue[i].range_m = NULL;
  758. queue[i].range_n = NULL;
  759. queue[i].sa = args;
  760. queue[i].sb = args;
  761. queue[i].next = &queue[i + 1];
  762. args += stride;
  763. }
  764. queue[numthreads - 1].next = NULL;
  765. exec_blas(numthreads, queue);
  766. return 0;
  767. }
  768. /* Shutdown procedure, but user don't have to call this routine. The */
  769. /* kernel automatically kill threads. */
  770. int BLASFUNC(blas_thread_shutdown)(void){
  771. int i;
  772. LOCK_COMMAND(&server_lock);
  773. //Free buffers allocated for threads
  774. for(i=0; i<MAX_CPU_NUMBER; i++){
  775. if(blas_thread_buffer[i]!=NULL){
  776. blas_memory_free(blas_thread_buffer[i]);
  777. blas_thread_buffer[i]=NULL;
  778. }
  779. }
  780. if (blas_server_avail) {
  781. for (i = 0; i < blas_num_threads - 1; i++) {
  782. pthread_mutex_lock (&thread_status[i].lock);
  783. atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
  784. thread_status[i].status = THREAD_STATUS_WAKEUP;
  785. pthread_cond_signal (&thread_status[i].wakeup);
  786. pthread_mutex_unlock(&thread_status[i].lock);
  787. }
  788. for(i = 0; i < blas_num_threads - 1; i++){
  789. pthread_join(blas_threads[i], NULL);
  790. }
  791. for(i = 0; i < blas_num_threads - 1; i++){
  792. pthread_mutex_destroy(&thread_status[i].lock);
  793. pthread_cond_destroy (&thread_status[i].wakeup);
  794. }
  795. #ifdef NEED_STACKATTR
  796. pthread_attr_destroy(&attr);
  797. #endif
  798. blas_server_avail = 0;
  799. }
  800. UNLOCK_COMMAND(&server_lock);
  801. return 0;
  802. }
  803. static void adjust_thread_buffers() {
  804. int i=0;
  805. //adjust buffer for each thread
  806. for(i=0; i < blas_cpu_number; i++){
  807. if(blas_thread_buffer[i] == NULL){
  808. blas_thread_buffer[i] = blas_memory_alloc(2);
  809. }
  810. }
  811. for(; i < MAX_CPU_NUMBER; i++){
  812. if(blas_thread_buffer[i] != NULL){
  813. blas_memory_free(blas_thread_buffer[i]);
  814. blas_thread_buffer[i] = NULL;
  815. }
  816. }
  817. }
  818. static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) {
  819. int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
  820. atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
  821. void *buffer = blas_thread_buffer[cpu];
  822. void *sa = queue -> sa;
  823. void *sb = queue -> sb;
  824. #ifdef SMP_DEBUG
  825. if (queue -> args) {
  826. fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
  827. cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
  828. }
  829. #endif
  830. #ifdef CONSISTENT_FPCSR
  831. #ifdef __aarch64__
  832. __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
  833. #else
  834. __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
  835. __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
  836. #endif
  837. #endif
  838. #ifdef MONITOR
  839. main_status[cpu] = MAIN_RUNNING1;
  840. #endif
  841. if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
  842. if (sb == NULL) {
  843. if (!(queue -> mode & BLAS_COMPLEX)){
  844. #ifdef EXPRECISION
  845. if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
  846. sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
  847. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  848. } else
  849. #endif
  850. if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
  851. #ifdef BUILD_DOUBLE
  852. sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
  853. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  854. #endif
  855. } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
  856. #ifdef BUILD_SINGLE
  857. sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
  858. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  859. #endif
  860. } else {
  861. /* Other types in future */
  862. }
  863. } else {
  864. #ifdef EXPRECISION
  865. if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
  866. sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
  867. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  868. } else
  869. #endif
  870. if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
  871. #ifdef BUILD_COMPLEX16
  872. sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
  873. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  874. #endif
  875. } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
  876. #ifdef BUILD_COMPLEX
  877. sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
  878. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  879. #endif
  880. } else {
  881. /* Other types in future */
  882. }
  883. }
  884. queue->sb=sb;
  885. }
  886. #ifdef MONITOR
  887. main_status[cpu] = MAIN_RUNNING2;
  888. #endif
  889. if (queue -> mode & BLAS_LEGACY) {
  890. legacy_exec(routine, queue -> mode, queue -> args, sb);
  891. } else
  892. if (queue -> mode & BLAS_PTHREAD) {
  893. void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
  894. (pthreadcompat)(queue -> args);
  895. } else
  896. (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
  897. #ifdef SMP_DEBUG
  898. fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
  899. #endif
  900. #ifdef MONITOR
  901. main_status[cpu] = MAIN_FINISH;
  902. #endif
  903. // arm: make sure all results are written out _before_
  904. // thread is marked as done and other threads use them
  905. MB;
  906. atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
  907. }
  908. #endif