You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

blas_server.c 27 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. #include "common.h"
  66. #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
  67. #include <dlfcn.h>
  68. #include <signal.h>
  69. #include <sys/resource.h>
  70. #include <sys/time.h>
  71. #endif
  /* Branch-prediction hints.                                          */
  /* With a GCC-compatible compiler (__GNUC__ defined) they expand to  */
  /* __builtin_expect on the normalized (0/1) condition; otherwise     */
  /* they expand to the bare condition. A definition of either macro   */
  /* that already exists is left untouched.                            */
  #ifndef likely
  #ifdef __GNUC__
  #define likely(x) __builtin_expect(!!(x), 1)
  #else
  #define likely(x) (x)
  #endif
  #endif
  #ifndef unlikely
  #ifdef __GNUC__
  #define unlikely(x) __builtin_expect(!!(x), 0)
  #else
  #define unlikely(x) (x)
  #endif
  #endif
  /* Defined elsewhere; blas_thread_init() treats a positive return    */
  /* value as a power-of-two exponent for the idle-spin timeout.       */
  extern unsigned int openblas_thread_timeout();

  #ifdef SMP_SERVER

  /* Debug/diagnostic switches are forced off for this file. */
  #undef MONITOR
  #undef TIMING
  #undef TIMING_DEBUG
  #undef NEED_STACKATTR

  /* Alignment (in bytes) applied to the shared state below. */
  #define ATTRIBUTE_SIZE 128

  /* This is a thread server model implementation. The threads are     */
  /* spawned at the first access to the BLAS library and remain alive  */
  /* until the shutdown routine is called. The number of threads is    */
  /* equal to "OMP_NUM_THREADS - 1", and a thread only wakes up when   */
  /* a job is queued.                                                  */

  /* We need this global for checking if initialization is finished. */
  int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;

  /* Local Variables */

  /* Lock taken around thread-pool creation, growth and shutdown. Its  */
  /* concrete type follows the locking primitive selected at build     */
  /* time.                                                             */
  #if defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
  #elif defined(USE_PTHREAD_SPINLOCK)
  static pthread_spinlock_t server_lock = 0;
  #else
  static unsigned long server_lock = 0;
  #endif

  /* Values of thread_status_t.status: SLEEP is set by a worker right  */
  /* before it blocks on its condition variable; WAKEUP is set by the  */
  /* code that signals it (and at initialization).                     */
  #define THREAD_STATUS_SLEEP 2
  #define THREAD_STATUS_WAKEUP 4

  /* pthread handles of the worker threads, indexed by worker number. */
  static pthread_t blas_threads [MAX_CPU_NUMBER];

  /* Per-worker mailbox and sleep state.                               */
  typedef struct {
    /* Job slot. Values used in this file:                             */
    /*    0             : worker is idle / previous job finished       */
    /*    1             : worker has picked up a job and is running it */
    /*   -1             : shutdown request                             */
    /*    anything else : pointer to the job waiting to be picked up   */
    blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE)));
  #if defined(OS_LINUX) && !defined(NO_AFFINITY)
    /* Value returned by gotoblas_set_affinity() for this worker; used */
    /* by the node-mapping branch of exec_blas_async().                */
    int node;
  #endif
    /* THREAD_STATUS_SLEEP or THREAD_STATUS_WAKEUP. */
    volatile long status;
    /* Mutex/condition pair the worker sleeps on after the spin        */
    /* timeout expires.                                                */
    pthread_mutex_t lock;
    pthread_cond_t wakeup;
  } thread_status_t;

  /* Accessors for the job slot. When the compiler reports C11 or      */
  /* newer they use the relaxed __atomic builtins; otherwise they fall */
  /* back to plain volatile accesses.                                  */
  #if (__STDC_VERSION__ >= 201112L)
  #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED)
  #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
  #else
  #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p))
  #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v))
  #endif

  static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));

  /* Default exponent of the idle-spin timeout; the timeout itself is  */
  /* 2^THREAD_TIMEOUT and is compared against rpcc() tick differences  */
  /* in the worker loop.                                               */
  #ifndef THREAD_TIMEOUT
  #define THREAD_TIMEOUT 28
  #endif
  static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));

  #ifdef MONITOR
  /* The monitor is a function that reports each thread's status every */
  /* second. It is normally turned off and is meant for debugging.     */
  static pthread_t monitor_thread;
  static int main_status[MAX_CPU_NUMBER];
  #define MAIN_ENTER 0x01
  #define MAIN_EXIT 0x02
  #define MAIN_TRYLOCK 0x03
  #define MAIN_LOCKSUCCESS 0x04
  #define MAIN_QUEUING 0x05
  #define MAIN_RECEIVING 0x06
  #define MAIN_RUNNING1 0x07
  #define MAIN_RUNNING2 0x08
  #define MAIN_RUNNING3 0x09
  #define MAIN_WAITING 0x0a
  #define MAIN_SLEEPING 0x0b
  #define MAIN_FINISH 0x0c
  #define MAIN_DONE 0x0d
  #endif

  /* NOTE(review): these two constants are not referenced anywhere in  */
  /* the visible part of this file.                                    */
  #define BLAS_QUEUE_FINISHED 3
  #define BLAS_QUEUE_RUNNING 4

  #ifdef TIMING
  /* Tick counter sampled by each worker at the top of its main loop. */
  BLASLONG exit_time[MAX_CPU_NUMBER];
  #endif
  156. static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
  157. if (!(mode & BLAS_COMPLEX)){
  158. #ifdef EXPRECISION
  159. if (mode & BLAS_XDOUBLE){
  160. /* REAL / Extended Double */
  161. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
  162. xdouble *, BLASLONG, xdouble *, BLASLONG,
  163. xdouble *, BLASLONG, void *) = func;
  164. afunc(args -> m, args -> n, args -> k,
  165. ((xdouble *)args -> alpha)[0],
  166. args -> a, args -> lda,
  167. args -> b, args -> ldb,
  168. args -> c, args -> ldc, sb);
  169. } else
  170. #endif
  171. if (mode & BLAS_DOUBLE){
  172. /* REAL / Double */
  173. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
  174. double *, BLASLONG, double *, BLASLONG,
  175. double *, BLASLONG, void *) = func;
  176. afunc(args -> m, args -> n, args -> k,
  177. ((double *)args -> alpha)[0],
  178. args -> a, args -> lda,
  179. args -> b, args -> ldb,
  180. args -> c, args -> ldc, sb);
  181. } else {
  182. /* REAL / Single */
  183. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
  184. float *, BLASLONG, float *, BLASLONG,
  185. float *, BLASLONG, void *) = func;
  186. afunc(args -> m, args -> n, args -> k,
  187. ((float *)args -> alpha)[0],
  188. args -> a, args -> lda,
  189. args -> b, args -> ldb,
  190. args -> c, args -> ldc, sb);
  191. }
  192. } else {
  193. #ifdef EXPRECISION
  194. if (mode & BLAS_XDOUBLE){
  195. /* COMPLEX / Extended Double */
  196. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
  197. xdouble *, BLASLONG, xdouble *, BLASLONG,
  198. xdouble *, BLASLONG, void *) = func;
  199. afunc(args -> m, args -> n, args -> k,
  200. ((xdouble *)args -> alpha)[0],
  201. ((xdouble *)args -> alpha)[1],
  202. args -> a, args -> lda,
  203. args -> b, args -> ldb,
  204. args -> c, args -> ldc, sb);
  205. } else
  206. #endif
  207. if (mode & BLAS_DOUBLE){
  208. /* COMPLEX / Double */
  209. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
  210. double *, BLASLONG, double *, BLASLONG,
  211. double *, BLASLONG, void *) = func;
  212. afunc(args -> m, args -> n, args -> k,
  213. ((double *)args -> alpha)[0],
  214. ((double *)args -> alpha)[1],
  215. args -> a, args -> lda,
  216. args -> b, args -> ldb,
  217. args -> c, args -> ldc, sb);
  218. } else {
  219. /* COMPLEX / Single */
  220. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
  221. float *, BLASLONG, float *, BLASLONG,
  222. float *, BLASLONG, void *) = func;
  223. afunc(args -> m, args -> n, args -> k,
  224. ((float *)args -> alpha)[0],
  225. ((float *)args -> alpha)[1],
  226. args -> a, args -> lda,
  227. args -> b, args -> ldb,
  228. args -> c, args -> ldc, sb);
  229. }
  230. }
  231. }
  #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  /* Affinity helpers implemented elsewhere. */
  int gotoblas_set_affinity(int);
  int gotoblas_set_affinity2(int);
  int get_node(void);
  #endif

  /* Set to 1 by goto_set_num_threads() once the pool has been grown   */
  /* beyond its initial size; workers started afterwards take the      */
  /* other affinity branch below.                                      */
  static int increased_threads = 0;

  /* Worker thread body.                                               */
  /*                                                                   */
  /*   arg : the worker index, passed by value through the pthread     */
  /*         argument pointer.                                         */
  /*                                                                   */
  /* The worker polls its job slot thread_status[cpu].queue. It spins  */
  /* (YIELDING) while the slot is empty and, once more than            */
  /* thread_timeout ticks have passed, blocks on its condition         */
  /* variable until it is signalled. A slot value of -1 ends the       */
  /* loop. For a real job the worker marks the slot as busy (1), runs  */
  /* the routine, and finally clears the slot (0), which is what       */
  /* exec_blas_async_wait() polls for. Always returns NULL.            */
  static void* blas_thread_server(void *arg){
    /* Thread identifier */
    BLASLONG cpu = (BLASLONG)arg;
    unsigned int last_tick;
    void *buffer, *sa, *sb;
    blas_queue_t *queue;
    blas_queue_t *tscq;
  #ifdef TIMING_DEBUG
    unsigned long start, stop;
  #endif
  #if defined(OS_LINUX) && !defined(NO_AFFINITY)
    /* Record the value returned by the affinity helper.               */
    /* NOTE(review): presumably a NUMA node id and -1 means "do not    */
    /* pin" -- confirm against gotoblas_set_affinity().                */
    if (!increased_threads)
      thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
    else
      thread_status[cpu].node = gotoblas_set_affinity(-1);
  #endif
  #ifdef MONITOR
    main_status[cpu] = MAIN_ENTER;
  #endif
    /* Per-thread work buffer; serves as the base for sa/sb whenever   */
    /* a job does not bring its own.                                   */
    buffer = blas_memory_alloc(2);
  #ifdef SMP_DEBUG
    fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
  #endif
    while (1){
  #ifdef MONITOR
      main_status[cpu] = MAIN_QUEUING;
  #endif
  #ifdef TIMING
      exit_time[cpu] = rpcc();
  #endif
      last_tick = (unsigned int)rpcc();
      tscq = atomic_load_queue(&thread_status[cpu].queue);
      /* Wait for a job: spin first, sleep after the timeout. */
      while(!tscq) {
        YIELDING;
        if ((unsigned int)rpcc() - last_tick > thread_timeout) {
          if (!atomic_load_queue(&thread_status[cpu].queue)) {
            pthread_mutex_lock (&thread_status[cpu].lock);
            thread_status[cpu].status = THREAD_STATUS_SLEEP;
            /* Re-check both conditions under the lock so that a job   */
            /* posted before this point is not missed and a spurious   */
            /* wake-up just goes back to waiting.                      */
            while (thread_status[cpu].status == THREAD_STATUS_SLEEP &&
                   !atomic_load_queue(&thread_status[cpu].queue)) {
  #ifdef MONITOR
              main_status[cpu] = MAIN_SLEEPING;
  #endif
              pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
            }
            pthread_mutex_unlock(&thread_status[cpu].lock);
          }
          /* Restart the spin window after the timeout was handled. */
          last_tick = (unsigned int)rpcc();
        }
        tscq = atomic_load_queue(&thread_status[cpu].queue);
      }
      queue = atomic_load_queue(&thread_status[cpu].queue);
      MB;
      /* -1 is the shutdown request stored by blas_thread_shutdown. */
      if ((long)queue == -1) break;
  #ifdef MONITOR
      main_status[cpu] = MAIN_RECEIVING;
  #endif
  #ifdef TIMING_DEBUG
      start = rpcc();
  #endif
      if (queue) {
        int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
        /* Mark the slot as "job taken, running". */
        atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
        sa = queue -> sa;
        sb = queue -> sb;
  #ifdef SMP_DEBUG
        if (queue -> args) {
          fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
                  cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
        }
  #endif
  #ifdef CONSISTENT_FPCSR
        /* Load the SSE and x87 control words stored in the job. */
        __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
        __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
  #endif
  #ifdef MONITOR
        main_status[cpu] = MAIN_RUNNING1;
  #endif
        /* No sa supplied: place it inside this thread's buffer. */
        if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
        /* No sb supplied: place it after sa, past a P*Q panel of the  */
        /* job's element type, rounded with the GEMM_ALIGN mask, plus  */
        /* GEMM_OFFSET_B.                                              */
        if (sb == NULL) {
          if (!(queue -> mode & BLAS_COMPLEX)){
  #ifdef EXPRECISION
            if (queue -> mode & BLAS_XDOUBLE){
              sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
                                              + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
            } else
  #endif
            if (queue -> mode & BLAS_DOUBLE){
              sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
                                              + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
            } else {
              sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
                                              + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
            }
          } else {
  #ifdef EXPRECISION
            if (queue -> mode & BLAS_XDOUBLE){
              sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
                                              + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
            } else
  #endif
            if (queue -> mode & BLAS_DOUBLE){
              sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
                                              + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
            } else {
              sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
                                              + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
            }
          }
          /* Publish the computed sb back into the job descriptor. */
          queue->sb=sb;
        }
  #ifdef MONITOR
        main_status[cpu] = MAIN_RUNNING2;
  #endif
        /* Dispatch according to the calling convention of the job. */
        if (queue -> mode & BLAS_LEGACY) {
          legacy_exec(routine, queue -> mode, queue -> args, sb);
        } else
        if (queue -> mode & BLAS_PTHREAD) {
          void (*pthreadcompat)(void *) = queue -> routine;
          (pthreadcompat)(queue -> args);
        } else
          (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
  #ifdef SMP_DEBUG
        fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
  #endif
  #ifdef MONITOR
        main_status[cpu] = MAIN_FINISH;
  #endif
        // arm: make sure all results are written out _before_
        // thread is marked as done and other threads use them
        MB;
        /* Clearing the slot signals completion to the waiting caller. */
        atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)0);
      }
  #ifdef MONITOR
      main_status[cpu] = MAIN_DONE;
  #endif
  #ifdef TIMING_DEBUG
      stop = rpcc();
      fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
              start, stop,
              stop - start);
  #endif
    }
    /* Shutdown procedure */
  #ifdef SMP_DEBUG
    fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
  #endif
    blas_memory_free(buffer);
    //pthread_exit(NULL);
    return NULL;
  }
  #ifdef MONITOR
  /* Counter bumped by exec_blas_async() each time it finds a worker   */
  /* asleep and has to signal it.                                      */
  static BLASLONG num_suspend = 0;

  /* Debug monitor: once per second, print the last recorded state of  */
  /* every worker followed by the suspend counter. Never returns in    */
  /* practice; `arg` is unused.                                        */
  static int blas_monitor(void *arg){
    int worker;

    for (;;) {
      for (worker = 0; worker < blas_num_threads - 1; worker++){
        const int state = main_status[worker];

        if (state == MAIN_ENTER) {
          fprintf(STDERR, "THREAD[%2d] : Entering.\n", worker);
        } else if (state == MAIN_EXIT) {
          fprintf(STDERR, "THREAD[%2d] : Exiting.\n", worker);
        } else if (state == MAIN_TRYLOCK) {
          fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", worker);
        } else if (state == MAIN_QUEUING) {
          fprintf(STDERR, "THREAD[%2d] : Queuing.\n", worker);
        } else if (state == MAIN_RECEIVING) {
          fprintf(STDERR, "THREAD[%2d] : Receiving.\n", worker);
        } else if (state == MAIN_RUNNING1) {
          fprintf(STDERR, "THREAD[%2d] : Running1.\n", worker);
        } else if (state == MAIN_RUNNING2) {
          fprintf(STDERR, "THREAD[%2d] : Running2.\n", worker);
        } else if (state == MAIN_RUNNING3) {
          fprintf(STDERR, "THREAD[%2d] : Running3.\n", worker);
        } else if (state == MAIN_WAITING) {
          fprintf(STDERR, "THREAD[%2d] : Waiting.\n", worker);
        } else if (state == MAIN_SLEEPING) {
          fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", worker);
        } else if (state == MAIN_FINISH) {
          fprintf(STDERR, "THREAD[%2d] : Finishing.\n", worker);
        } else if (state == MAIN_DONE) {
          fprintf(STDERR, "THREAD[%2d] : Job is done.\n", worker);
        }

        fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend);
      }
      sleep(1);
    }
    return 0;
  }
  #endif
  440. /* Initializing routine */
  441. int blas_thread_init(void){
  442. BLASLONG i;
  443. int ret;
  444. int thread_timeout_env;
  445. #ifdef NEED_STACKATTR
  446. pthread_attr_t attr;
  447. #endif
  448. if (blas_server_avail) return 0;
  449. #ifdef NEED_STACKATTR
  450. pthread_attr_init(&attr);
  451. pthread_attr_setguardsize(&attr, 0x1000U);
  452. pthread_attr_setstacksize( &attr, 0x1000U);
  453. #endif
  454. LOCK_COMMAND(&server_lock);
  455. if (!blas_server_avail){
  456. thread_timeout_env=openblas_thread_timeout();
  457. if (thread_timeout_env>0) {
  458. if (thread_timeout_env < 4) thread_timeout_env = 4;
  459. if (thread_timeout_env > 30) thread_timeout_env = 30;
  460. thread_timeout = (1 << thread_timeout_env);
  461. }
  462. for(i = 0; i < blas_num_threads - 1; i++){
  463. atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
  464. thread_status[i].status = THREAD_STATUS_WAKEUP;
  465. pthread_mutex_init(&thread_status[i].lock, NULL);
  466. pthread_cond_init (&thread_status[i].wakeup, NULL);
  467. #ifdef NEED_STACKATTR
  468. ret=pthread_create(&blas_threads[i], &attr,
  469. &blas_thread_server, (void *)i);
  470. #else
  471. ret=pthread_create(&blas_threads[i], NULL,
  472. &blas_thread_server, (void *)i);
  473. #endif
  474. if(ret!=0){
  475. struct rlimit rlim;
  476. const char *msg = strerror(ret);
  477. fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
  478. #ifdef RLIMIT_NPROC
  479. if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
  480. fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
  481. "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
  482. }
  483. #endif
  484. if(0 != raise(SIGINT)) {
  485. fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
  486. exit(EXIT_FAILURE);
  487. }
  488. }
  489. }
  490. #ifdef MONITOR
  491. pthread_create(&monitor_thread, NULL,
  492. (void *)&blas_monitor, (void *)NULL);
  493. #endif
  494. blas_server_avail = 1;
  495. }
  496. UNLOCK_COMMAND(&server_lock);
  497. return 0;
  498. }
  499. /*
  500. User can call one of two routines.
  501. exec_blas_async ... immediately returns after jobs are queued.
  502. exec_blas ... returns after jobs are finished.
  503. */
  504. static BLASULONG exec_queue_lock = 0;
  505. int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
  506. #ifdef SMP_SERVER
  507. // Handle lazy re-init of the thread-pool after a POSIX fork
  508. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  509. #endif
  510. BLASLONG i = 0;
  511. blas_queue_t *current = queue;
  512. blas_queue_t *tsiq,*tspq;
  513. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  514. int node = get_node();
  515. int nodes = get_num_nodes();
  516. #endif
  517. #ifdef SMP_DEBUG
  518. int exec_count = 0;
  519. fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos);
  520. #endif
  521. blas_lock(&exec_queue_lock);
  522. while (queue) {
  523. queue -> position = pos;
  524. #ifdef CONSISTENT_FPCSR
  525. __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
  526. __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
  527. #endif
  528. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  529. /* Node Mapping Mode */
  530. if (queue -> mode & BLAS_NODE) {
  531. do {
  532. while((thread_status[i].node != node || atomic_load_queue(&thread_status[i].queue)) && (i < blas_num_threads - 1)) i ++;
  533. if (i < blas_num_threads - 1) break;
  534. i ++;
  535. if (i >= blas_num_threads - 1) {
  536. i = 0;
  537. node ++;
  538. if (node >= nodes) node = 0;
  539. }
  540. } while (1);
  541. } else {
  542. tsiq = atomic_load_queue(&thread_status[i].queue);
  543. while(tsiq) {
  544. i ++;
  545. if (i >= blas_num_threads - 1) i = 0;
  546. tsiq = atomic_load_queue(&thread_status[i].queue);
  547. }
  548. }
  549. #else
  550. tsiq = atomic_load_queue(&thread_status[i].queue);
  551. while(tsiq) {
  552. i ++;
  553. if (i >= blas_num_threads - 1) i = 0;
  554. tsiq = atomic_load_queue(&thread_status[i].queue);
  555. }
  556. #endif
  557. queue -> assigned = i;
  558. MB;
  559. atomic_store_queue(&thread_status[i].queue, queue);
  560. queue = queue -> next;
  561. pos ++;
  562. #ifdef SMP_DEBUG
  563. exec_count ++;
  564. #endif
  565. }
  566. blas_unlock(&exec_queue_lock);
  567. #ifdef SMP_DEBUG
  568. fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count);
  569. #endif
  570. while (current) {
  571. pos = current -> assigned;
  572. tspq = atomic_load_queue(&thread_status[pos].queue);
  573. if ((BLASULONG)tspq > 1) {
  574. pthread_mutex_lock (&thread_status[pos].lock);
  575. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  576. #ifdef MONITOR
  577. num_suspend ++;
  578. #endif
  579. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  580. thread_status[pos].status = THREAD_STATUS_WAKEUP;
  581. pthread_cond_signal(&thread_status[pos].wakeup);
  582. }
  583. }
  584. pthread_mutex_unlock(&thread_status[pos].lock);
  585. }
  586. current = current -> next;
  587. }
  588. return 0;
  589. }
  590. int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
  591. blas_queue_t * tsqq;
  592. while ((num > 0) && queue) {
  593. tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
  594. while(tsqq) {
  595. YIELDING;
  596. tsqq = atomic_load_queue(&thread_status[queue->assigned].queue);
  597. };
  598. queue = queue -> next;
  599. num --;
  600. }
  601. MB;
  602. #ifdef SMP_DEBUG
  603. fprintf(STDERR, "Done.\n\n");
  604. #endif
  605. return 0;
  606. }
  607. /* Execute Threads */
  608. int exec_blas(BLASLONG num, blas_queue_t *queue){
  609. #ifdef SMP_SERVER
  610. // Handle lazy re-init of the thread-pool after a POSIX fork
  611. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  612. #endif
  613. int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
  614. #ifdef TIMING_DEBUG
  615. BLASULONG start, stop;
  616. #endif
  617. if ((num <= 0) || (queue == NULL)) return 0;
  618. #ifdef SMP_DEBUG
  619. fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
  620. #endif
  621. #ifdef __ELF__
  622. if (omp_in_parallel && (num > 1)) {
  623. if (omp_in_parallel() > 0) {
  624. fprintf(stderr,
  625. "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
  626. "Please rebuild the library with USE_OPENMP=1 option.\n");
  627. }
  628. }
  629. #endif
  630. if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
  631. #ifdef TIMING_DEBUG
  632. start = rpcc();
  633. fprintf(STDERR, "\n");
  634. #endif
  635. routine = queue -> routine;
  636. if (queue -> mode & BLAS_LEGACY) {
  637. legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
  638. } else
  639. if (queue -> mode & BLAS_PTHREAD) {
  640. void (*pthreadcompat)(void *) = queue -> routine;
  641. (pthreadcompat)(queue -> args);
  642. } else
  643. (routine)(queue -> args, queue -> range_m, queue -> range_n,
  644. queue -> sa, queue -> sb, 0);
  645. #ifdef TIMING_DEBUG
  646. stop = rpcc();
  647. #endif
  648. if ((num > 1) && queue -> next) {
  649. exec_blas_async_wait(num - 1, queue -> next);
  650. // arm: make sure results from other threads are visible
  651. MB;
  652. }
  653. #ifdef TIMING_DEBUG
  654. fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
  655. start, stop,
  656. stop - start);
  657. #endif
  658. return 0;
  659. }
  660. void goto_set_num_threads(int num_threads) {
  661. long i;
  662. #ifdef SMP_SERVER
  663. // Handle lazy re-init of the thread-pool after a POSIX fork
  664. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  665. #endif
  666. if (num_threads < 1) num_threads = blas_num_threads;
  667. #ifndef NO_AFFINITY
  668. if (num_threads == 1) {
  669. if (blas_cpu_number == 1){
  670. //OpenBLAS is already single thread.
  671. return;
  672. }else{
  673. //From multi-threads to single thread
  674. //Restore the original affinity mask
  675. gotoblas_set_affinity(-1);
  676. }
  677. }
  678. #endif
  679. if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
  680. if (num_threads > blas_num_threads) {
  681. LOCK_COMMAND(&server_lock);
  682. increased_threads = 1;
  683. for(i = blas_num_threads - 1; i < num_threads - 1; i++){
  684. atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
  685. thread_status[i].status = THREAD_STATUS_WAKEUP;
  686. pthread_mutex_init(&thread_status[i].lock, NULL);
  687. pthread_cond_init (&thread_status[i].wakeup, NULL);
  688. #ifdef NEED_STACKATTR
  689. pthread_create(&blas_threads[i], &attr,
  690. &blas_thread_server, (void *)i);
  691. #else
  692. pthread_create(&blas_threads[i], NULL,
  693. &blas_thread_server, (void *)i);
  694. #endif
  695. }
  696. blas_num_threads = num_threads;
  697. UNLOCK_COMMAND(&server_lock);
  698. }
  699. #ifndef NO_AFFINITY
  700. if(blas_cpu_number == 1 && num_threads > 1){
  701. //Restore the thread 0 affinity.
  702. gotoblas_set_affinity(0);
  703. }
  704. #endif
  705. blas_cpu_number = num_threads;
  706. #if defined(ARCH_MIPS64)
  707. //set parameters for different number of threads.
  708. blas_set_parameter();
  709. #endif
  710. }
  /* Public alias: forwards the request unchanged to                   */
  /* goto_set_num_threads().                                           */
  void openblas_set_num_threads(int num_threads) {
    goto_set_num_threads(num_threads);
  }
  714. /* Compatible function with pthread_create / join */
  715. int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
  716. blas_queue_t queue[MAX_CPU_NUMBER];
  717. int i;
  718. if (numthreads <= 0) return 0;
  719. #ifdef SMP
  720. if (blas_cpu_number == 0) blas_get_cpu_number();
  721. #ifdef SMP_SERVER
  722. if (blas_server_avail == 0) blas_thread_init();
  723. #endif
  724. #endif
  725. for (i = 0; i < numthreads; i ++) {
  726. queue[i].mode = BLAS_PTHREAD;
  727. queue[i].routine = function;
  728. queue[i].args = args;
  729. queue[i].range_m = NULL;
  730. queue[i].range_n = NULL;
  731. queue[i].sa = args;
  732. queue[i].sb = args;
  733. queue[i].next = &queue[i + 1];
  734. args += stride;
  735. }
  736. queue[numthreads - 1].next = NULL;
  737. exec_blas(numthreads, queue);
  738. return 0;
  739. }
  740. /* Shutdown procedure, but user don't have to call this routine. The */
  741. /* kernel automatically kill threads. */
  742. int BLASFUNC(blas_thread_shutdown)(void){
  743. int i;
  744. if (!blas_server_avail) return 0;
  745. LOCK_COMMAND(&server_lock);
  746. for (i = 0; i < blas_num_threads - 1; i++) {
  747. pthread_mutex_lock (&thread_status[i].lock);
  748. atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
  749. thread_status[i].status = THREAD_STATUS_WAKEUP;
  750. pthread_cond_signal (&thread_status[i].wakeup);
  751. pthread_mutex_unlock(&thread_status[i].lock);
  752. }
  753. for(i = 0; i < blas_num_threads - 1; i++){
  754. pthread_join(blas_threads[i], NULL);
  755. }
  756. for(i = 0; i < blas_num_threads - 1; i++){
  757. pthread_mutex_destroy(&thread_status[i].lock);
  758. pthread_cond_destroy (&thread_status[i].wakeup);
  759. }
  760. #ifdef NEED_STACKATTR
  761. pthread_attr_destory(&attr);
  762. #endif
  763. blas_server_avail = 0;
  764. UNLOCK_COMMAND(&server_lock);
  765. return 0;
  766. }
  767. #endif