You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

blas_server.c 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945
  1. /*****************************************************************************
  2. Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************/
  28. /* Copyright 2009, 2010 The University of Texas at Austin. */
  29. /* All rights reserved. */
  30. /* */
  31. /* Redistribution and use in source and binary forms, with or */
  32. /* without modification, are permitted provided that the following */
  33. /* conditions are met: */
  34. /* */
  35. /* 1. Redistributions of source code must retain the above */
  36. /* copyright notice, this list of conditions and the following */
  37. /* disclaimer. */
  38. /* */
  39. /* 2. Redistributions in binary form must reproduce the above */
  40. /* copyright notice, this list of conditions and the following */
  41. /* disclaimer in the documentation and/or other materials */
  42. /* provided with the distribution. */
  43. /* */
  44. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  45. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  46. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  47. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  48. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  49. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  50. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  51. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  52. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  53. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  54. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  55. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  56. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  57. /* POSSIBILITY OF SUCH DAMAGE. */
  58. /* */
  59. /* The views and conclusions contained in the software and */
  60. /* documentation are those of the authors and should not be */
  61. /* interpreted as representing official policies, either expressed */
  62. /* or implied, of The University of Texas at Austin. */
  63. /*********************************************************************/
  64. #include "common.h"
  65. #ifdef OS_LINUX
  66. #include <dlfcn.h>
  67. #include <sys/resource.h>
  68. #endif
  /* Branch-prediction hints.  Each macro is defined only when nothing earlier
   * has already provided it.  With a GCC-compatible compiler the hint is passed
   * to __builtin_expect; otherwise the macro collapses to the bare expression. */
  #if !defined(likely)
  #  if defined(__GNUC__)
  #    define likely(x) __builtin_expect(!!(x), 1)
  #  else
  #    define likely(x) (x)
  #  endif
  #endif

  #if !defined(unlikely)
  #  if defined(__GNUC__)
  #    define unlikely(x) __builtin_expect(!!(x), 0)
  #  else
  #    define unlikely(x) (x)
  #  endif
  #endif
  83. #ifdef SMP_SERVER
  84. #undef MONITOR
  85. #undef TIMING
  86. #undef TIMING_DEBUG
  87. #undef NEED_STACKATTR
  88. #define ATTRIBUTE_SIZE 128
  89. /* This is a thread server model implementation. The threads are */
  90. /* spawned at first access to blas library, and still remains until */
  91. /* destruction routine is called. The number of threads are */
  92. /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
  93. /* jobs is queued. */
  94. /* We need this grobal for cheking if initialization is finished. */
  95. int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
  96. /* Local Variables */
  97. #if defined(USE_PTHREAD_LOCK)
  98. static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
  99. #elif defined(USE_PTHREAD_SPINLOCK)
  100. static pthread_spinlock_t server_lock = 0;
  101. #else
  102. static unsigned long server_lock = 0;
  103. #endif
  104. #define THREAD_STATUS_SLEEP 2
  105. #define THREAD_STATUS_WAKEUP 4
  106. static pthread_t blas_threads [MAX_CPU_NUMBER];
  107. typedef struct {
  108. blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE)));
  109. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  110. int node;
  111. #endif
  112. volatile long status;
  113. pthread_mutex_t lock;
  114. pthread_cond_t wakeup;
  115. } thread_status_t;
  116. static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
  117. #ifndef THREAD_TIMEOUT
  118. #define THREAD_TIMEOUT 28
  119. #endif
  120. static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
  121. #ifdef MONITOR
  122. /* Monitor is a function to see thread's status for every seconds. */
  123. /* Usually it turns off and it's for debugging. */
  124. static pthread_t monitor_thread;
  125. static int main_status[MAX_CPU_NUMBER];
  126. #define MAIN_ENTER 0x01
  127. #define MAIN_EXIT 0x02
  128. #define MAIN_TRYLOCK 0x03
  129. #define MAIN_LOCKSUCCESS 0x04
  130. #define MAIN_QUEUING 0x05
  131. #define MAIN_RECEIVING 0x06
  132. #define MAIN_RUNNING1 0x07
  133. #define MAIN_RUNNING2 0x08
  134. #define MAIN_RUNNING3 0x09
  135. #define MAIN_WAITING 0x0a
  136. #define MAIN_SLEEPING 0x0b
  137. #define MAIN_FINISH 0x0c
  138. #define MAIN_DONE 0x0d
  139. #endif
  140. #define BLAS_QUEUE_FINISHED 3
  141. #define BLAS_QUEUE_RUNNING 4
  142. #ifdef TIMING
  143. BLASLONG exit_time[MAX_CPU_NUMBER];
  144. #endif
  145. static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
  146. if (!(mode & BLAS_COMPLEX)){
  147. #ifdef EXPRECISION
  148. if (mode & BLAS_XDOUBLE){
  149. /* REAL / Extended Double */
  150. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
  151. xdouble *, BLASLONG, xdouble *, BLASLONG,
  152. xdouble *, BLASLONG, void *) = func;
  153. afunc(args -> m, args -> n, args -> k,
  154. ((xdouble *)args -> alpha)[0],
  155. args -> a, args -> lda,
  156. args -> b, args -> ldb,
  157. args -> c, args -> ldc, sb);
  158. } else
  159. #endif
  160. if (mode & BLAS_DOUBLE){
  161. /* REAL / Double */
  162. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
  163. double *, BLASLONG, double *, BLASLONG,
  164. double *, BLASLONG, void *) = func;
  165. afunc(args -> m, args -> n, args -> k,
  166. ((double *)args -> alpha)[0],
  167. args -> a, args -> lda,
  168. args -> b, args -> ldb,
  169. args -> c, args -> ldc, sb);
  170. } else {
  171. /* REAL / Single */
  172. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
  173. float *, BLASLONG, float *, BLASLONG,
  174. float *, BLASLONG, void *) = func;
  175. afunc(args -> m, args -> n, args -> k,
  176. ((float *)args -> alpha)[0],
  177. args -> a, args -> lda,
  178. args -> b, args -> ldb,
  179. args -> c, args -> ldc, sb);
  180. }
  181. } else {
  182. #ifdef EXPRECISION
  183. if (mode & BLAS_XDOUBLE){
  184. /* COMPLEX / Extended Double */
  185. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
  186. xdouble *, BLASLONG, xdouble *, BLASLONG,
  187. xdouble *, BLASLONG, void *) = func;
  188. afunc(args -> m, args -> n, args -> k,
  189. ((xdouble *)args -> alpha)[0],
  190. ((xdouble *)args -> alpha)[1],
  191. args -> a, args -> lda,
  192. args -> b, args -> ldb,
  193. args -> c, args -> ldc, sb);
  194. } else
  195. #endif
  196. if (mode & BLAS_DOUBLE){
  197. /* COMPLEX / Double */
  198. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
  199. double *, BLASLONG, double *, BLASLONG,
  200. double *, BLASLONG, void *) = func;
  201. afunc(args -> m, args -> n, args -> k,
  202. ((double *)args -> alpha)[0],
  203. ((double *)args -> alpha)[1],
  204. args -> a, args -> lda,
  205. args -> b, args -> ldb,
  206. args -> c, args -> ldc, sb);
  207. } else {
  208. /* COMPLEX / Single */
  209. void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
  210. float *, BLASLONG, float *, BLASLONG,
  211. float *, BLASLONG, void *) = func;
  212. afunc(args -> m, args -> n, args -> k,
  213. ((float *)args -> alpha)[0],
  214. ((float *)args -> alpha)[1],
  215. args -> a, args -> lda,
  216. args -> b, args -> ldb,
  217. args -> c, args -> ldc, sb);
  218. }
  219. }
  220. }
  221. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  222. int gotoblas_set_affinity(int);
  223. int gotoblas_set_affinity2(int);
  224. int get_node(void);
  225. #endif
  226. static int increased_threads = 0;
  227. static int blas_thread_server(void *arg){
  228. /* Thread identifier */
  229. BLASLONG cpu = (BLASLONG)arg;
  230. unsigned int last_tick;
  231. void *buffer, *sa, *sb;
  232. blas_queue_t *queue;
  233. #ifdef TIMING_DEBUG
  234. unsigned long start, stop;
  235. #endif
  236. #if defined(OS_LINUX) && !defined(NO_AFFINITY)
  237. if (!increased_threads)
  238. thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
  239. else
  240. thread_status[cpu].node = gotoblas_set_affinity(-1);
  241. #endif
  242. #ifdef MONITOR
  243. main_status[cpu] = MAIN_ENTER;
  244. #endif
  245. buffer = blas_memory_alloc(2);
  246. #ifdef SMP_DEBUG
  247. fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
  248. #endif
  249. while (1){
  250. #ifdef MONITOR
  251. main_status[cpu] = MAIN_QUEUING;
  252. #endif
  253. #ifdef TIMING
  254. exit_time[cpu] = rpcc();
  255. #endif
  256. last_tick = (unsigned int)rpcc();
  257. while (!thread_status[cpu].queue) {
  258. YIELDING;
  259. if ((unsigned int)rpcc() - last_tick > thread_timeout) {
  260. pthread_mutex_lock (&thread_status[cpu].lock);
  261. if (!thread_status[cpu].queue) {
  262. thread_status[cpu].status = THREAD_STATUS_SLEEP;
  263. while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
  264. #ifdef MONITOR
  265. main_status[cpu] = MAIN_SLEEPING;
  266. #endif
  267. pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
  268. }
  269. }
  270. pthread_mutex_unlock(&thread_status[cpu].lock);
  271. last_tick = (unsigned int)rpcc();
  272. }
  273. }
  274. queue = thread_status[cpu].queue;
  275. if ((long)queue == -1) break;
  276. #ifdef MONITOR
  277. main_status[cpu] = MAIN_RECEIVING;
  278. #endif
  279. #ifdef TIMING_DEBUG
  280. start = rpcc();
  281. #endif
  282. if (queue) {
  283. int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
  284. thread_status[cpu].queue = (blas_queue_t *)1;
  285. sa = queue -> sa;
  286. sb = queue -> sb;
  287. #ifdef SMP_DEBUG
  288. if (queue -> args) {
  289. fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
  290. cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
  291. }
  292. #endif
  293. #ifdef CONSISTENT_FPCSR
  294. __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
  295. __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
  296. #endif
  297. #ifdef MONITOR
  298. main_status[cpu] = MAIN_RUNNING1;
  299. #endif
  300. if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
  301. if (sb == NULL) {
  302. if (!(queue -> mode & BLAS_COMPLEX)){
  303. #ifdef EXPRECISION
  304. if (queue -> mode & BLAS_XDOUBLE){
  305. sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
  306. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  307. } else
  308. #endif
  309. if (queue -> mode & BLAS_DOUBLE){
  310. sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
  311. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  312. } else {
  313. sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
  314. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  315. }
  316. } else {
  317. #ifdef EXPRECISION
  318. if (queue -> mode & BLAS_XDOUBLE){
  319. sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
  320. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  321. } else
  322. #endif
  323. if (queue -> mode & BLAS_DOUBLE){
  324. sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
  325. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  326. } else {
  327. sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
  328. + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
  329. }
  330. }
  331. queue->sb=sb;
  332. }
  333. #ifdef MONITOR
  334. main_status[cpu] = MAIN_RUNNING2;
  335. #endif
  336. if (queue -> mode & BLAS_LEGACY) {
  337. legacy_exec(routine, queue -> mode, queue -> args, sb);
  338. } else
  339. if (queue -> mode & BLAS_PTHREAD) {
  340. void (*pthreadcompat)(void *) = queue -> routine;
  341. (pthreadcompat)(queue -> args);
  342. } else
  343. (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
  344. #ifdef SMP_DEBUG
  345. fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
  346. #endif
  347. #ifdef MONITOR
  348. main_status[cpu] = MAIN_FINISH;
  349. #endif
  350. thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
  351. WMB;
  352. }
  353. #ifdef MONITOR
  354. main_status[cpu] = MAIN_DONE;
  355. #endif
  356. #ifdef TIMING_DEBUG
  357. stop = rpcc();
  358. fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
  359. start, stop,
  360. stop - start);
  361. #endif
  362. }
  363. /* Shutdown procedure */
  364. #ifdef SMP_DEBUG
  365. fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
  366. #endif
  367. blas_memory_free(buffer);
  368. //pthread_exit(NULL);
  369. return 0;
  370. }
  #ifdef MONITOR

  /* Incremented by exec_blas_async() each time it takes a worker's lock */
  /* in order to wake it up.                                             */
  static BLASLONG num_suspend = 0;

  /* Debug thread entry point: once per second, print the last state each
   * worker recorded in main_status[].  The loop never terminates.
   *
   *   arg : unused.
   *
   * The signature matches what pthread_create() requires
   * (void *(*)(void *)).
   */
  static void *blas_monitor(void *arg){
    int i;

    (void)arg;

    while(1){
      for (i = 0; i < blas_num_threads - 1; i++){
        switch (main_status[i]) {
        case MAIN_ENTER :
          fprintf(STDERR, "THREAD[%2d] : Entering.\n", i);
          break;
        case MAIN_EXIT :
          fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i);
          break;
        case MAIN_TRYLOCK :
          fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i);
          break;
        case MAIN_QUEUING :
          fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i);
          break;
        case MAIN_RECEIVING :
          fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i);
          break;
        case MAIN_RUNNING1 :
          fprintf(STDERR, "THREAD[%2d] : Running1.\n", i);
          break;
        case MAIN_RUNNING2 :
          fprintf(STDERR, "THREAD[%2d] : Running2.\n", i);
          break;
        case MAIN_RUNNING3 :
          fprintf(STDERR, "THREAD[%2d] : Running3.\n", i);
          break;
        case MAIN_WAITING :
          fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i);
          break;
        case MAIN_SLEEPING :
          fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i);
          break;
        case MAIN_FINISH :
          fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i);
          break;
        case MAIN_DONE :
          fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i);
          break;
        default :
          /* No state recorded yet, or a state with no message: print nothing. */
          break;
        }

        fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend);
      }
      sleep(1);
    }

    return NULL;
  }
  #endif
  422. /* Initializing routine */
  423. int blas_thread_init(void){
  424. BLASLONG i;
  425. int ret;
  426. #ifdef NEED_STACKATTR
  427. pthread_attr_t attr;
  428. #endif
  429. if (blas_server_avail) return 0;
  430. #ifdef NEED_STACKATTR
  431. pthread_attr_init(&attr);
  432. pthread_attr_setguardsize(&attr, 0x1000U);
  433. pthread_attr_setstacksize( &attr, 0x1000U);
  434. #endif
  435. LOCK_COMMAND(&server_lock);
  436. if (!blas_server_avail){
  437. env_var_t p;
  438. if (readenv(p,"THREAD_TIMEOUT")) {
  439. thread_timeout = atoi(p);
  440. if (thread_timeout < 4) thread_timeout = 4;
  441. if (thread_timeout > 30) thread_timeout = 30;
  442. thread_timeout = (1 << thread_timeout);
  443. }else{
  444. if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
  445. thread_timeout = atoi(p);
  446. if (thread_timeout < 4) thread_timeout = 4;
  447. if (thread_timeout > 30) thread_timeout = 30;
  448. thread_timeout = (1 << thread_timeout);
  449. }
  450. }
  451. for(i = 0; i < blas_num_threads - 1; i++){
  452. thread_status[i].queue = (blas_queue_t *)NULL;
  453. thread_status[i].status = THREAD_STATUS_WAKEUP;
  454. pthread_mutex_init(&thread_status[i].lock, NULL);
  455. pthread_cond_init (&thread_status[i].wakeup, NULL);
  456. #ifdef NEED_STACKATTR
  457. ret=pthread_create(&blas_threads[i], &attr,
  458. (void *)&blas_thread_server, (void *)i);
  459. #else
  460. ret=pthread_create(&blas_threads[i], NULL,
  461. (void *)&blas_thread_server, (void *)i);
  462. #endif
  463. if(ret!=0){
  464. fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
  465. exit(1);
  466. }
  467. }
  468. #ifdef MONITOR
  469. pthread_create(&monitor_thread, NULL,
  470. (void *)&blas_monitor, (void *)NULL);
  471. #endif
  472. blas_server_avail = 1;
  473. }
  474. UNLOCK_COMMAND(&server_lock);
  475. return 0;
  476. }
  477. /*
  478. User can call one of two routines.
  479. exec_blas_async ... immediately returns after jobs are queued.
  480. exec_blas ... returns after jobs are finished.
  481. */
  482. static BLASULONG exec_queue_lock = 0;
  483. int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
  484. #ifdef SMP_SERVER
  485. // Handle lazy re-init of the thread-pool after a POSIX fork
  486. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  487. #endif
  488. BLASLONG i = 0;
  489. blas_queue_t *current = queue;
  490. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  491. int node = get_node();
  492. int nodes = get_num_nodes();
  493. #endif
  494. #ifdef SMP_DEBUG
  495. int exec_count = 0;
  496. fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos);
  497. #endif
  498. blas_lock(&exec_queue_lock);
  499. while (queue) {
  500. queue -> position = pos;
  501. #ifdef CONSISTENT_FPCSR
  502. __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
  503. __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
  504. #endif
  505. #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  506. /* Node Mapping Mode */
  507. if (queue -> mode & BLAS_NODE) {
  508. do {
  509. while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++;
  510. if (i < blas_num_threads - 1) break;
  511. i ++;
  512. if (i >= blas_num_threads - 1) {
  513. i = 0;
  514. node ++;
  515. if (node >= nodes) node = 0;
  516. }
  517. } while (1);
  518. } else {
  519. while(thread_status[i].queue) {
  520. i ++;
  521. if (i >= blas_num_threads - 1) i = 0;
  522. }
  523. }
  524. #else
  525. while(thread_status[i].queue) {
  526. i ++;
  527. if (i >= blas_num_threads - 1) i = 0;
  528. }
  529. #endif
  530. queue -> assigned = i;
  531. WMB;
  532. thread_status[i].queue = queue;
  533. WMB;
  534. queue = queue -> next;
  535. pos ++;
  536. #ifdef SMP_DEBUG
  537. exec_count ++;
  538. #endif
  539. }
  540. blas_unlock(&exec_queue_lock);
  541. #ifdef SMP_DEBUG
  542. fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count);
  543. #endif
  544. while (current) {
  545. pos = current -> assigned;
  546. if ((BLASULONG)thread_status[pos].queue > 1) {
  547. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  548. pthread_mutex_lock (&thread_status[pos].lock);
  549. #ifdef MONITOR
  550. num_suspend ++;
  551. #endif
  552. if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
  553. thread_status[pos].status = THREAD_STATUS_WAKEUP;
  554. pthread_cond_signal(&thread_status[pos].wakeup);
  555. }
  556. pthread_mutex_unlock(&thread_status[pos].lock);
  557. }
  558. }
  559. current = current -> next;
  560. }
  561. return 0;
  562. }
  563. int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
  564. while ((num > 0) && queue) {
  565. while(thread_status[queue -> assigned].queue) {
  566. YIELDING;
  567. };
  568. queue = queue -> next;
  569. num --;
  570. }
  571. #ifdef SMP_DEBUG
  572. fprintf(STDERR, "Done.\n\n");
  573. #endif
  574. return 0;
  575. }
  576. /* Execute Threads */
  577. int exec_blas(BLASLONG num, blas_queue_t *queue){
  578. #ifdef SMP_SERVER
  579. // Handle lazy re-init of the thread-pool after a POSIX fork
  580. if (unlikely(blas_server_avail == 0)) blas_thread_init();
  581. #endif
  582. int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
  583. #ifdef TIMING_DEBUG
  584. BLASULONG start, stop;
  585. #endif
  586. if ((num <= 0) || (queue == NULL)) return 0;
  587. #ifdef SMP_DEBUG
  588. fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
  589. #endif
  590. #ifdef __ELF__
  591. if (omp_in_parallel && (num > 1)) {
  592. if (omp_in_parallel() > 0) {
  593. fprintf(stderr,
  594. "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
  595. "Please rebuild the library with USE_OPENMP=1 option.\n");
  596. }
  597. }
  598. #endif
  599. if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
  600. #ifdef TIMING_DEBUG
  601. start = rpcc();
  602. fprintf(STDERR, "\n");
  603. #endif
  604. routine = queue -> routine;
  605. if (queue -> mode & BLAS_LEGACY) {
  606. legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
  607. } else
  608. if (queue -> mode & BLAS_PTHREAD) {
  609. void (*pthreadcompat)(void *) = queue -> routine;
  610. (pthreadcompat)(queue -> args);
  611. } else
  612. (routine)(queue -> args, queue -> range_m, queue -> range_n,
  613. queue -> sa, queue -> sb, 0);
  614. #ifdef TIMING_DEBUG
  615. stop = rpcc();
  616. #endif
  617. if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
  618. #ifdef TIMING_DEBUG
  619. fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
  620. start, stop,
  621. stop - start);
  622. #endif
  623. return 0;
  624. }
  625. void goto_set_num_threads(int num_threads) {
  626. long i;
  627. if (num_threads < 1) num_threads = blas_num_threads;
  628. #ifndef NO_AFFINITY
  629. if (num_threads == 1) {
  630. if (blas_cpu_number == 1){
  631. //OpenBLAS is already single thread.
  632. return;
  633. }else{
  634. //From multi-threads to single thread
  635. //Restore the original affinity mask
  636. gotoblas_set_affinity(-1);
  637. }
  638. }
  639. #endif
  640. if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
  641. if (num_threads > blas_num_threads) {
  642. LOCK_COMMAND(&server_lock);
  643. increased_threads = 1;
  644. for(i = blas_num_threads - 1; i < num_threads - 1; i++){
  645. thread_status[i].queue = (blas_queue_t *)NULL;
  646. thread_status[i].status = THREAD_STATUS_WAKEUP;
  647. pthread_mutex_init(&thread_status[i].lock, NULL);
  648. pthread_cond_init (&thread_status[i].wakeup, NULL);
  649. #ifdef NEED_STACKATTR
  650. pthread_create(&blas_threads[i], &attr,
  651. (void *)&blas_thread_server, (void *)i);
  652. #else
  653. pthread_create(&blas_threads[i], NULL,
  654. (void *)&blas_thread_server, (void *)i);
  655. #endif
  656. }
  657. blas_num_threads = num_threads;
  658. UNLOCK_COMMAND(&server_lock);
  659. }
  660. #ifndef NO_AFFINITY
  661. if(blas_cpu_number == 1 && num_threads > 1){
  662. //Restore the thread 0 affinity.
  663. gotoblas_set_affinity(0);
  664. }
  665. #endif
  666. blas_cpu_number = num_threads;
  667. #if defined(ARCH_MIPS64)
  668. //set parameters for different number of threads.
  669. blas_set_parameter();
  670. #endif
  671. }
  /* Alias of goto_set_num_threads(): forwards num_threads unchanged. */
  void openblas_set_num_threads(int num_threads)
  {
    goto_set_num_threads(num_threads);
  }
  675. /* Compatible function with pthread_create / join */
  676. int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
  677. blas_queue_t queue[MAX_CPU_NUMBER];
  678. int i;
  679. if (numthreads <= 0) return 0;
  680. #ifdef SMP
  681. if (blas_cpu_number == 0) blas_get_cpu_number();
  682. #ifdef SMP_SERVER
  683. if (blas_server_avail == 0) blas_thread_init();
  684. #endif
  685. #endif
  686. for (i = 0; i < numthreads; i ++) {
  687. queue[i].mode = BLAS_PTHREAD;
  688. queue[i].routine = function;
  689. queue[i].args = args;
  690. queue[i].range_m = NULL;
  691. queue[i].range_n = NULL;
  692. queue[i].sa = args;
  693. queue[i].sb = args;
  694. queue[i].next = &queue[i + 1];
  695. args += stride;
  696. }
  697. queue[numthreads - 1].next = NULL;
  698. exec_blas(numthreads, queue);
  699. return 0;
  700. }
  701. /* Shutdown procedure, but user don't have to call this routine. The */
  702. /* kernel automatically kill threads. */
  703. int BLASFUNC(blas_thread_shutdown)(void){
  704. int i;
  705. if (!blas_server_avail) return 0;
  706. LOCK_COMMAND(&server_lock);
  707. for (i = 0; i < blas_num_threads - 1; i++) {
  708. blas_lock(&exec_queue_lock);
  709. thread_status[i].queue = (blas_queue_t *)-1;
  710. blas_unlock(&exec_queue_lock);
  711. pthread_mutex_lock (&thread_status[i].lock);
  712. thread_status[i].status = THREAD_STATUS_WAKEUP;
  713. pthread_cond_signal (&thread_status[i].wakeup);
  714. pthread_mutex_unlock(&thread_status[i].lock);
  715. }
  716. for(i = 0; i < blas_num_threads - 1; i++){
  717. pthread_join(blas_threads[i], NULL);
  718. }
  719. for(i = 0; i < blas_num_threads - 1; i++){
  720. pthread_mutex_destroy(&thread_status[i].lock);
  721. pthread_cond_destroy (&thread_status[i].wakeup);
  722. }
  723. #ifdef NEED_STACKATTR
  724. pthread_attr_destory(&attr);
  725. #endif
  726. blas_server_avail = 0;
  727. UNLOCK_COMMAND(&server_lock);
  728. return 0;
  729. }
  730. #endif