You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

blas_server.c 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. #include "common.h"
  66. #ifdef OS_LINUX
  67. #include <dlfcn.h>
  68. #include <sys/resource.h>
  69. #endif
  /*
   * Branch-prediction hints. With a GNU-compatible compiler they expand to
   * __builtin_expect on the normalised (0/1) condition; otherwise they are
   * plain pass-through macros. Definitions supplied earlier are left alone.
   */
  #ifdef __GNUC__
  #  ifndef likely
  #    define likely(x) __builtin_expect(!!(x), 1)
  #  endif
  #  ifndef unlikely
  #    define unlikely(x) __builtin_expect(!!(x), 0)
  #  endif
  #else
  #  ifndef likely
  #    define likely(x) (x)
  #  endif
  #  ifndef unlikely
  #    define unlikely(x) (x)
  #  endif
  #endif
  84. #ifdef SMP_SERVER
  85. #undef MONITOR
  86. #undef TIMING
  87. #undef TIMING_DEBUG
  88. #undef NEED_STACKATTR
  89. #define ATTRIBUTE_SIZE 128
  90. /* This is a thread server model implementation. The threads are */
  91. /* spawned at the first access to the BLAS library and remain alive until */
  92. /* the destruction routine is called. The number of threads is */
  93. /* equal to "OMP_NUM_THREADS - 1", and a thread only wakes up when */
  94. /* a job is queued. */
  95. /* We need this global for checking whether initialization is finished. */
  96. int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
  97. /* Local Variables */
  98. #if defined(USE_PTHREAD_LOCK)
  99. static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
  100. #elif defined(USE_PTHREAD_SPINLOCK)
  101. static pthread_spinlock_t server_lock = 0;
  102. #else
  103. static unsigned long server_lock = 0;
  104. #endif
  105. #define THREAD_STATUS_SLEEP 2
  106. #define THREAD_STATUS_WAKEUP 4
  107. static pthread_t blas_threads [MAX_CPU_NUMBER];
  108. typedef struct {
  109. blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE)));
  110. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  111. int node;
  112. #endif
  113. volatile long status;
  114. pthread_mutex_t lock;
  115. pthread_cond_t wakeup;
  116. } thread_status_t;
  117. static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
  118. #ifndef THREAD_TIMEOUT
  119. #define THREAD_TIMEOUT 28
  120. #endif
  121. static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
  122. #ifdef MONITOR
  123. /* Monitor is a function that reports each thread's status every second. */
  124. /* It is normally turned off and exists only for debugging. */
  125. static pthread_t monitor_thread;
  126. static int main_status[MAX_CPU_NUMBER];
  127. #define MAIN_ENTER 0x01
  128. #define MAIN_EXIT 0x02
  129. #define MAIN_TRYLOCK 0x03
  130. #define MAIN_LOCKSUCCESS 0x04
  131. #define MAIN_QUEUING 0x05
  132. #define MAIN_RECEIVING 0x06
  133. #define MAIN_RUNNING1 0x07
  134. #define MAIN_RUNNING2 0x08
  135. #define MAIN_RUNNING3 0x09
  136. #define MAIN_WAITING 0x0a
  137. #define MAIN_SLEEPING 0x0b
  138. #define MAIN_FINISH 0x0c
  139. #define MAIN_DONE 0x0d
  140. #endif
  141. #define BLAS_QUEUE_FINISHED 3
  142. #define BLAS_QUEUE_RUNNING 4
  143. #ifdef TIMING
  144. BLASLONG exit_time[MAX_CPU_NUMBER];
  145. #endif
  146. static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
  147. if (!(mode & BLAS_COMPLEX)){
  148. #ifdef EXPRECISION
  149. if (mode & BLAS_XDOUBLE){
  150. /* REAL / Extended Double */
  151. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
  152. xdouble *, BLASLONG, xdouble *, BLASLONG,
  153. xdouble *, BLASLONG, void *) = func;
  154. afunc(args -> m, args -> n, args -> k,
  155. ((xdouble *)args -> alpha)[0],
  156. args -> a, args -> lda,
  157. args -> b, args -> ldb,
  158. args -> c, args -> ldc, sb);
  159. } else
  160. #endif
  161. if (mode & BLAS_DOUBLE){
  162. /* REAL / Double */
  163. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
  164. double *, BLASLONG, double *, BLASLONG,
  165. double *, BLASLONG, void *) = func;
  166. afunc(args -> m, args -> n, args -> k,
  167. ((double *)args -> alpha)[0],
  168. args -> a, args -> lda,
  169. args -> b, args -> ldb,
  170. args -> c, args -> ldc, sb);
  171. } else {
  172. /* REAL / Single */
  173. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
  174. float *, BLASLONG, float *, BLASLONG,
  175. float *, BLASLONG, void *) = func;
  176. afunc(args -> m, args -> n, args -> k,
  177. ((float *)args -> alpha)[0],
  178. args -> a, args -> lda,
  179. args -> b, args -> ldb,
  180. args -> c, args -> ldc, sb);
  181. }
  182. } else {
  183. #ifdef EXPRECISION
  184. if (mode & BLAS_XDOUBLE){
  185. /* COMPLEX / Extended Double */
  186. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
  187. xdouble *, BLASLONG, xdouble *, BLASLONG,
  188. xdouble *, BLASLONG, void *) = func;
  189. afunc(args -> m, args -> n, args -> k,
  190. ((xdouble *)args -> alpha)[0],
  191. ((xdouble *)args -> alpha)[1],
  192. args -> a, args -> lda,
  193. args -> b, args -> ldb,
  194. args -> c, args -> ldc, sb);
  195. } else
  196. #endif
  197. if (mode & BLAS_DOUBLE){
  198. /* COMPLEX / Double */
  199. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
  200. double *, BLASLONG, double *, BLASLONG,
  201. double *, BLASLONG, void *) = func;
  202. afunc(args -> m, args -> n, args -> k,
  203. ((double *)args -> alpha)[0],
  204. ((double *)args -> alpha)[1],
  205. args -> a, args -> lda,
  206. args -> b, args -> ldb,
  207. args -> c, args -> ldc, sb);
  208. } else {
  209. /* COMPLEX / Single */
  210. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
  211. float *, BLASLONG, float *, BLASLONG,
  212. float *, BLASLONG, void *) = func;
  213. afunc(args -> m, args -> n, args -> k,
  214. ((float *)args -> alpha)[0],
  215. ((float *)args -> alpha)[1],
  216. args -> a, args -> lda,
  217. args -> b, args -> ldb,
  218. args -> c, args -> ldc, sb);
  219. }
  220. }
  221. }
  222. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  223. int gotoblas_set_affinity(int);
  224. int gotoblas_set_affinity2(int);
  225. int get_node(void);
  226. #endif
  227. static int increased_threads = 0;
  228. static int blas_thread_server(void *arg){
  229. /* Thread identifier */
  230. BLASLONG cpu = (BLASLONG)arg;
  231. unsigned int last_tick;
  232. void *buffer, *sa, *sb;
  233. blas_queue_t *queue;
  234. #ifdef TIMING_DEBUG
  235. unsigned long start, stop;
  236. #endif
  237. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  238. if (!increased_threads)
  239. thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
  240. else
  241. thread_status[cpu].node = gotoblas_set_affinity(-1);
  242. #endif
  243. #ifdef MONITOR
  244. main_status[cpu] = MAIN_ENTER;
  245. #endif
  246. buffer = blas_memory_alloc(2);
  247. #ifdef SMP_DEBUG
  248. fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
  249. #endif
  250. while (1){
  251. #ifdef MONITOR
  252. main_status[cpu] = MAIN_QUEUING;
  253. #endif
  254. #ifdef TIMING
  255. exit_time[cpu] = rpcc();
  256. #endif
  257. last_tick = (unsigned int)rpcc();
  258. while (!thread_status[cpu].queue) {
  259. YIELDING;
  260. if ((unsigned int)rpcc() - last_tick > thread_timeout) {
  261. pthread_mutex_lock (&thread_status[cpu].lock);
  262. if (!thread_status[cpu].queue) {
  263. thread_status[cpu].status = THREAD_STATUS_SLEEP;
  264. while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
  265. #ifdef MONITOR
  266. main_status[cpu] = MAIN_SLEEPING;
  267. #endif
  268. pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
  269. }
  270. }
  271. pthread_mutex_unlock(&thread_status[cpu].lock);
  272. last_tick = (unsigned int)rpcc();
  273. }
  274. }
  275. queue = thread_status[cpu].queue;
  276. if ((long)queue == -1) break;
  277. #ifdef MONITOR
  278. main_status[cpu] = MAIN_RECEIVING;
  279. #endif
  280. #ifdef TIMING_DEBUG
  281. start = rpcc();
  282. #endif
  283. if (queue) {
  284. int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
  285. thread_status[cpu].queue = (blas_queue_t *)1;
  286. sa = queue -> sa;
  287. sb = queue -> sb;
  288. #ifdef SMP_DEBUG
  289. if (queue -> args) {
  290. fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
  291. cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
  292. }
  293. #endif
  294. #ifdef CONSISTENT_FPCSR
  295. __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
  296. __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
  297. #endif
  298. #ifdef MONITOR
  299. main_status[cpu] = MAIN_RUNNING1;
  300. #endif
  301. if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
  302. if (sb == NULL) {
  303. if (!(queue -> mode & BLAS_COMPLEX)){
  304. #ifdef EXPRECISION
  305. if (queue -> mode & BLAS_XDOUBLE){
  306. sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
  307. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  308. } else
  309. #endif
  310. if (queue -> mode & BLAS_DOUBLE){
  311. sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
  312. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  313. } else {
  314. sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
  315. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  316. }
  317. } else {
  318. #ifdef EXPRECISION
  319. if (queue -> mode & BLAS_XDOUBLE){
  320. sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
  321. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  322. } else
  323. #endif
  324. if (queue -> mode & BLAS_DOUBLE){
  325. sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
  326. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  327. } else {
  328. sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
  329. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  330. }
  331. }
  332. queue->sb=sb;
  333. }
  334. #ifdef MONITOR
  335. main_status[cpu] = MAIN_RUNNING2;
  336. #endif
  337. if (queue -> mode & BLAS_LEGACY) {
  338. legacy_exec(routine, queue -> mode, queue -> args, sb);
  339. } else
  340. if (queue -> mode & BLAS_PTHREAD) {
  341. void (*pthreadcompat)(void *) = queue -> routine;
  342. (pthreadcompat)(queue -> args);
  343. } else
  344. (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
  345. #ifdef SMP_DEBUG
  346. fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
  347. #endif
  348. #ifdef MONITOR
  349. main_status[cpu] = MAIN_FINISH;
  350. #endif
  351. thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
  352. WMB;
  353. }
  354. #ifdef MONITOR
  355. main_status[cpu] = MAIN_DONE;
  356. #endif
  357. #ifdef TIMING_DEBUG
  358. stop = rpcc();
  359. fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
  360. start, stop,
  361. stop - start);
  362. #endif
  363. }
  364. /* Shutdown procedure */
  365. #ifdef SMP_DEBUG
  366. fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
  367. #endif
  368. blas_memory_free(buffer);
  369. //pthread_exit(NULL);
  370. return 0;
  371. }
  372. #ifdef MONITOR
  373. static BLASLONG num_suspend = 0;
  374. static int blas_monitor(void *arg){
  375. int i;
  376. while(1){
  377. for (i = 0; i < blas_num_threads - 1; i++){
  378. switch (main_status[i]) {
  379. case MAIN_ENTER :
  380. fprintf(STDERR, "THREAD[%2d] : Entering.\n", i);
  381. break;
  382. case MAIN_EXIT :
  383. fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i);
  384. break;
  385. case MAIN_TRYLOCK :
  386. fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i);
  387. break;
  388. case MAIN_QUEUING :
  389. fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i);
  390. break;
  391. case MAIN_RECEIVING :
  392. fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i);
  393. break;
  394. case MAIN_RUNNING1 :
  395. fprintf(STDERR, "THREAD[%2d] : Running1.\n", i);
  396. break;
  397. case MAIN_RUNNING2 :
  398. fprintf(STDERR, "THREAD[%2d] : Running2.\n", i);
  399. break;
  400. case MAIN_RUNNING3 :
  401. fprintf(STDERR, "THREAD[%2d] : Running3.\n", i);
  402. break;
  403. case MAIN_WAITING :
  404. fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i);
  405. break;
  406. case MAIN_SLEEPING :
  407. fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i);
  408. break;
  409. case MAIN_FINISH :
  410. fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i);
  411. break;
  412. case MAIN_DONE :
  413. fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i);
  414. break;
  415. }
  416. fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend);
  417. }
  418. sleep(1);
  419. }
  420. return 0;
  421. }
  422. #endif
  423. /* Initializing routine */
  424. int blas_thread_init(void){
  425. BLASLONG i;
  426. int ret;
  427. #ifdef NEED_STACKATTR
  428. pthread_attr_t attr;
  429. #endif
  430. if (blas_server_avail) return 0;
  431. #ifdef NEED_STACKATTR
  432. pthread_attr_init(&attr);
  433. pthread_attr_setguardsize(&attr, 0x1000U);
  434. pthread_attr_setstacksize( &attr, 0x1000U);
  435. #endif
  436. LOCK_COMMAND(&server_lock);
  437. if (!blas_server_avail){
  438. env_var_t p;
  439. if (readenv(p,"THREAD_TIMEOUT")) {
  440. thread_timeout = atoi(p);
  441. if (thread_timeout < 4) thread_timeout = 4;
  442. if (thread_timeout > 30) thread_timeout = 30;
  443. thread_timeout = (1 << thread_timeout);
  444. }else{
  445. if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
  446. thread_timeout = atoi(p);
  447. if (thread_timeout < 4) thread_timeout = 4;
  448. if (thread_timeout > 30) thread_timeout = 30;
  449. thread_timeout = (1 << thread_timeout);
  450. }
  451. }
  452. for(i = 0; i < blas_num_threads - 1; i++){
  453. thread_status[i].queue = (blas_queue_t *)NULL;
  454. thread_status[i].status = THREAD_STATUS_WAKEUP;
  455. pthread_mutex_init(&thread_status[i].lock, NULL);
  456. pthread_cond_init (&thread_status[i].wakeup, NULL);
  457. #ifdef NEED_STACKATTR
  458. ret=pthread_create(&blas_threads[i], &attr,
  459. (void *)&blas_thread_server, (void *)i);
  460. #else
  461. ret=pthread_create(&blas_threads[i], NULL,
  462. (void *)&blas_thread_server, (void *)i);
  463. #endif
  464. if(ret!=0){
  465. fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
  466. exit(1);
  467. }
  468. }
  469. #ifdef MONITOR
  470. pthread_create(&monitor_thread, NULL,
  471. (void *)&blas_monitor, (void *)NULL);
  472. #endif
  473. blas_server_avail = 1;
  474. }
  475. UNLOCK_COMMAND(&server_lock);
  476. return 0;
  477. }
  478. /*
  479. User can call one of two routines.
  480. exec_blas_async ... immediately returns after jobs are queued.
  481. exec_blas ... returns after jobs are finished.
  482. */
  483. static BLASULONG exec_queue_lock = 0;
  484. int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
  485. #ifdef SMP_SERVER
  486. // Handle lazy re-init of the thread-pool after a POSIX fork
  487. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  488. #endif
  489. BLASLONG i = 0;
  490. blas_queue_t *current = queue;
  491. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  492. int node = get_node();
  493. int nodes = get_num_nodes();
  494. #endif
  495. #ifdef SMP_DEBUG
  496. int exec_count = 0;
  497. fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos);
  498. #endif
  499. blas_lock(&exec_queue_lock);
  500. while (queue) {
  501. queue -> position = pos;
  502. #ifdef CONSISTENT_FPCSR
  503. __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
  504. __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
  505. #endif
  506. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  507. /* Node Mapping Mode */
  508. if (queue -> mode & BLAS_NODE) {
  509. do {
  510. while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
  511. if (i < blas_num_threads - 1) break;
  512. i ++;
  513. if (i >= blas_num_threads - 1) {
  514. i = 0;
  515. node ++;
  516. if (node >= nodes) node = 0;
  517. }
  518. } while (1);
  519. } else {
  520. while(thread_status[i].queue) {
  521. i ++;
  522. if (i >= blas_num_threads - 1) i = 0;
  523. }
  524. }
  525. #else
  526. while(thread_status[i].queue) {
  527. i ++;
  528. if (i >= blas_num_threads - 1) i = 0;
  529. }
  530. #endif
  531. queue -> assigned = i;
  532. WMB;
  533. thread_status[i].queue = queue;
  534. WMB;
  535. queue = queue -> next;
  536. pos ++;
  537. #ifdef SMP_DEBUG
  538. exec_count ++;
  539. #endif
  540. }
  541. blas_unlock(&exec_queue_lock);
  542. #ifdef SMP_DEBUG
  543. fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count);
  544. #endif
  545. while (current) {
  546. pos = current -> assigned;
  547. if ((BLASULONG)thread_status[pos].queue > 1) {
  548. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  549. pthread_mutex_lock (&thread_status[pos].lock);
  550. #ifdef MONITOR
  551. num_suspend ++;
  552. #endif
  553. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  554. thread_status[pos].status = THREAD_STATUS_WAKEUP;
  555. pthread_cond_signal(&thread_status[pos].wakeup);
  556. }
  557. pthread_mutex_unlock(&thread_status[pos].lock);
  558. }
  559. }
  560. current = current -> next;
  561. }
  562. return 0;
  563. }
  564. int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
  565. while ((num > 0) && queue) {
  566. while(thread_status[queue -> assigned].queue) {
  567. YIELDING;
  568. };
  569. queue = queue -> next;
  570. num --;
  571. }
  572. #ifdef SMP_DEBUG
  573. fprintf(STDERR, "Done.\n\n");
  574. #endif
  575. return 0;
  576. }
  577. /* Execute Threads */
  578. int exec_blas(BLASLONG num, blas_queue_t *queue){
  579. #ifdef SMP_SERVER
  580. // Handle lazy re-init of the thread-pool after a POSIX fork
  581. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  582. #endif
  583. int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
  584. #ifdef TIMING_DEBUG
  585. BLASULONG start, stop;
  586. #endif
  587. if ((num <= 0) || (queue == NULL)) return 0;
  588. #ifdef SMP_DEBUG
  589. fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
  590. #endif
  591. #ifdef __ELF__
  592. if (omp_in_parallel && (num > 1)) {
  593. if (omp_in_parallel() > 0) {
  594. fprintf(stderr,
  595. "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
  596. "Please rebuild the library with USE_OPENMP=1 option.\n");
  597. }
  598. }
  599. #endif
  600. if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
  601. #ifdef TIMING_DEBUG
  602. start = rpcc();
  603. fprintf(STDERR, "\n");
  604. #endif
  605. routine = queue -> routine;
  606. if (queue -> mode & BLAS_LEGACY) {
  607. legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
  608. } else
  609. if (queue -> mode & BLAS_PTHREAD) {
  610. void (*pthreadcompat)(void *) = queue -> routine;
  611. (pthreadcompat)(queue -> args);
  612. } else
  613. (routine)(queue -> args, queue -> range_m, queue -> range_n,
  614. queue -> sa, queue -> sb, 0);
  615. #ifdef TIMING_DEBUG
  616. stop = rpcc();
  617. #endif
  618. if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
  619. #ifdef TIMING_DEBUG
  620. fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
  621. start, stop,
  622. stop - start);
  623. #endif
  624. return 0;
  625. }
  626. void goto_set_num_threads(int num_threads) {
  627. long i;
  628. if (num_threads < 1) num_threads = blas_num_threads;
  629. #ifndef NO_AFFINITY
  630. if (num_threads == 1) {
  631. if (blas_cpu_number == 1){
  632. //OpenBLAS is already single thread.
  633. return;
  634. }else{
  635. //From multi-threads to single thread
  636. //Restore the original affinity mask
  637. gotoblas_set_affinity(-1);
  638. }
  639. }
  640. #endif
  641. if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
  642. if (num_threads > blas_num_threads) {
  643. LOCK_COMMAND(&server_lock);
  644. increased_threads = 1;
  645. for(i = blas_num_threads - 1; i < num_threads - 1; i++){
  646. thread_status[i].queue = (blas_queue_t *)NULL;
  647. thread_status[i].status = THREAD_STATUS_WAKEUP;
  648. pthread_mutex_init(&thread_status[i].lock, NULL);
  649. pthread_cond_init (&thread_status[i].wakeup, NULL);
  650. #ifdef NEED_STACKATTR
  651. pthread_create(&blas_threads[i], &attr,
  652. (void *)&blas_thread_server, (void *)i);
  653. #else
  654. pthread_create(&blas_threads[i], NULL,
  655. (void *)&blas_thread_server, (void *)i);
  656. #endif
  657. }
  658. blas_num_threads = num_threads;
  659. UNLOCK_COMMAND(&server_lock);
  660. }
  661. #ifndef NO_AFFINITY
  662. if(blas_cpu_number == 1 && num_threads > 1){
  663. //Restore the thread 0 affinity.
  664. gotoblas_set_affinity(0);
  665. }
  666. #endif
  667. blas_cpu_number = num_threads;
  668. #if defined(ARCH_MIPS64)
  669. //set parameters for different number of threads.
  670. blas_set_parameter();
  671. #endif
  672. }
  /*
   * OpenBLAS-named entry point for changing the thread count; it forwards
   * num_threads unchanged to goto_set_num_threads().
   */
  void openblas_set_num_threads(int num_threads)
  {
    goto_set_num_threads(num_threads);
  }
  676. /* Compatible function with pthread_create / join */
  677. int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
  678. blas_queue_t queue[MAX_CPU_NUMBER];
  679. int i;
  680. if (numthreads <= 0) return 0;
  681. #ifdef SMP
  682. if (blas_cpu_number == 0) blas_get_cpu_number();
  683. #ifdef SMP_SERVER
  684. if (blas_server_avail == 0) blas_thread_init();
  685. #endif
  686. #endif
  687. for (i = 0; i < numthreads; i ++) {
  688. queue[i].mode = BLAS_PTHREAD;
  689. queue[i].routine = function;
  690. queue[i].args = args;
  691. queue[i].range_m = NULL;
  692. queue[i].range_n = NULL;
  693. queue[i].sa = args;
  694. queue[i].sb = args;
  695. queue[i].next = &queue[i + 1];
  696. args += stride;
  697. }
  698. queue[numthreads - 1].next = NULL;
  699. exec_blas(numthreads, queue);
  700. return 0;
  701. }
  702. /* Shutdown procedure. Users do not have to call this routine; the */
  703. /* kernel automatically kills the threads. */
  704. int BLASFUNC(blas_thread_shutdown)(void){
  705. int i;
  706. if (!blas_server_avail) return 0;
  707. LOCK_COMMAND(&server_lock);
  708. for (i = 0; i < blas_num_threads - 1; i++) {
  709. blas_lock(&exec_queue_lock);
  710. thread_status[i].queue = (blas_queue_t *)-1;
  711. blas_unlock(&exec_queue_lock);
  712. pthread_mutex_lock (&thread_status[i].lock);
  713. thread_status[i].status = THREAD_STATUS_WAKEUP;
  714. pthread_cond_signal (&thread_status[i].wakeup);
  715. pthread_mutex_unlock(&thread_status[i].lock);
  716. }
  717. for(i = 0; i < blas_num_threads - 1; i++){
  718. pthread_join(blas_threads[i], NULL);
  719. }
  720. for(i = 0; i < blas_num_threads - 1; i++){
  721. pthread_mutex_destroy(&thread_status[i].lock);
  722. pthread_cond_destroy (&thread_status[i].wakeup);
  723. }
  724. #ifdef NEED_STACKATTR
  725. pthread_attr_destory(&attr);
  726. #endif
  727. blas_server_avail = 0;
  728. UNLOCK_COMMAND(&server_lock);
  729. return 0;
  730. }
  731. #endif