You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

memory.c 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371
  1. /*****************************************************************************
  2. Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************/
  28. /* Copyright 2009, 2010 The University of Texas at Austin. */
  29. /* All rights reserved. */
  30. /* */
  31. /* Redistribution and use in source and binary forms, with or */
  32. /* without modification, are permitted provided that the following */
  33. /* conditions are met: */
  34. /* */
  35. /* 1. Redistributions of source code must retain the above */
  36. /* copyright notice, this list of conditions and the following */
  37. /* disclaimer. */
  38. /* */
  39. /* 2. Redistributions in binary form must reproduce the above */
  40. /* copyright notice, this list of conditions and the following */
  41. /* disclaimer in the documentation and/or other materials */
  42. /* provided with the distribution. */
  43. /* */
  44. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  45. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  46. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  47. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  48. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  49. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  50. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  51. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  52. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  53. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  54. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  55. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  56. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  57. /* POSSIBILITY OF SUCH DAMAGE. */
  58. /* */
  59. /* The views and conclusions contained in the software and */
  60. /* documentation are those of the authors and should not be */
  61. /* interpreted as representing official policies, either expressed */
  62. /* or implied, of The University of Texas at Austin. */
  63. /*********************************************************************/
  64. //#undef DEBUG
  65. #include "common.h"
  66. #include <errno.h>
  67. #ifdef OS_WINDOWS
  68. #define ALLOC_WINDOWS
  69. #ifndef MEM_LARGE_PAGES
  70. #define MEM_LARGE_PAGES 0x20000000
  71. #endif
  72. #else
  73. #define ALLOC_MMAP
  74. #define ALLOC_MALLOC
  75. #endif
  76. #include <stdlib.h>
  77. #include <stdio.h>
  78. #include <fcntl.h>
  79. #ifndef OS_WINDOWS
  80. #include <sys/mman.h>
  81. #include <sys/shm.h>
  82. #include <sys/ipc.h>
  83. #endif
  84. #include <sys/types.h>
  85. #ifdef OS_LINUX
  86. #include <sys/sysinfo.h>
  87. #include <sched.h>
  88. #include <errno.h>
  89. #include <linux/unistd.h>
  90. #include <sys/syscall.h>
  91. #endif
  92. #if defined(OS_FREEBSD) || defined(OS_DARWIN)
  93. #include <sys/sysctl.h>
  94. #include <sys/resource.h>
  95. #endif
  96. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  97. #include <conio.h>
  98. #undef printf
  99. #define printf _cprintf
  100. #endif
  101. #ifdef OS_LINUX
  102. #ifndef MPOL_PREFERRED
  103. #define MPOL_PREFERRED 1
  104. #endif
  105. #endif
  106. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  107. #define NO_WARMUP
  108. #endif
  109. #ifndef SHM_HUGETLB
  110. #define SHM_HUGETLB 04000
  111. #endif
  112. #ifndef FIXED_PAGESIZE
  113. #define FIXED_PAGESIZE 4096
  114. #endif
  115. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  116. #define CONSTRUCTOR __attribute__ ((constructor))
  117. #define DESTRUCTOR __attribute__ ((destructor))
  118. #ifdef DYNAMIC_ARCH
  119. gotoblas_t *gotoblas = NULL;
  120. #endif
  121. extern void openblas_warning(int verbose, const char * msg);
  122. #ifndef SMP
  123. #define blas_cpu_number 1
  124. #define blas_num_threads 1
  125. /* Dummy Function */
  /* Build without SMP: the processor count reported to callers is always 1. */
  int goto_get_num_procs(void)
  {
    return 1;
  }
  /* Build without SMP: the requested thread count is accepted and ignored. */
  void goto_set_num_threads(int num_threads)
  {
    (void)num_threads;
  }
  128. #else
  129. #ifdef OS_LINUX
  130. #ifndef NO_AFFINITY
  131. int get_num_procs(void);
  132. #else
  /*
   * Returns the processor count obtained from get_nprocs().
   * The value is fetched on the first call and kept in a function-local
   * static, so later calls return the cached number.
   */
  int get_num_procs(void)
  {
    static int cached_count = 0;

    if (cached_count == 0) {
      cached_count = get_nprocs();
    }
    return cached_count;
  }
  138. #endif
  139. #endif
  140. #ifdef OS_WINDOWS
  141. int get_num_procs(void) {
  142. static int nums = 0;
  143. if (nums == 0) {
  144. SYSTEM_INFO sysinfo;
  145. GetSystemInfo(&sysinfo);
  146. nums = sysinfo.dwNumberOfProcessors;
  147. }
  148. return nums;
  149. }
  150. #endif
  151. #if defined(OS_FREEBSD)
  152. int get_num_procs(void) {
  153. static int nums = 0;
  154. int m[2];
  155. size_t len;
  156. if (nums == 0) {
  157. m[0] = CTL_HW;
  158. m[1] = HW_NCPU;
  159. len = sizeof(int);
  160. sysctl(m, 2, &nums, &len, NULL, 0);
  161. }
  162. return nums;
  163. }
  164. #endif
  165. #if defined(OS_DARWIN)
  /*
   * Returns the value of the "hw.physicalcpu" sysctl.
   * The result is stored in a function-local static; the query is repeated
   * only while that static is still 0. The sysctlbyname() return code is
   * not inspected.
   */
  int get_num_procs(void)
  {
    static int nums = 0;

    if (nums == 0) {
      size_t len = sizeof(int);

      sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
    }

    return nums;
  }
  175. /*
  176. void set_stack_limit(int limitMB){
  177. int result=0;
  178. struct rlimit rl;
  179. rlim_t StackSize;
  180. StackSize=limitMB*1024*1024;
  181. result=getrlimit(RLIMIT_STACK, &rl);
  182. if(result==0){
  183. if(rl.rlim_cur < StackSize){
  184. rl.rlim_cur=StackSize;
  185. result=setrlimit(RLIMIT_STACK, &rl);
  186. if(result !=0){
  187. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  188. }
  189. }
  190. }
  191. }
  192. */
  193. #endif
  /*
  OpenBLAS uses the number of CPU cores for multithreading.
  It can be set with openblas_set_num_threads(int num_threads);
  */
  198. int blas_cpu_number = 0;
  /*
  The number of threads in the thread pool.
  This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
  */
  203. int blas_num_threads = 0;
  204. int goto_get_num_procs (void) {
  205. return blas_cpu_number;
  206. }
  207. void openblas_fork_handler()
  208. {
  209. // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  210. // built with "make USE_OPENMP=0".
  211. // Hanging can still happen when OpenBLAS is built against the libgomp
  212. // implementation of OpenMP. The problem is tracked at:
  213. // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  214. // In the mean time build with USE_OPENMP=0 or link against another
  215. // implementation of OpenMP.
  216. int err;
  217. err = pthread_atfork (BLASFUNC(blas_thread_shutdown), NULL, NULL);
  218. if(err != 0)
  219. openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  220. }
  221. int blas_get_cpu_number(void){
  222. char *p;
  223. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  224. int max_num;
  225. #endif
  226. int blas_goto_num = 0;
  227. int blas_omp_num = 0;
  228. if (blas_num_threads) return blas_num_threads;
  229. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  230. max_num = get_num_procs();
  231. #endif
  232. blas_goto_num = 0;
  233. #ifndef USE_OPENMP
  234. p = getenv("OPENBLAS_NUM_THREADS");
  235. if (p) blas_goto_num = atoi(p);
  236. if (blas_goto_num < 0) blas_goto_num = 0;
  237. if (blas_goto_num == 0) {
  238. p = getenv("GOTO_NUM_THREADS");
  239. if (p) blas_goto_num = atoi(p);
  240. if (blas_goto_num < 0) blas_goto_num = 0;
  241. }
  242. #endif
  243. blas_omp_num = 0;
  244. p = getenv("OMP_NUM_THREADS");
  245. if (p) blas_omp_num = atoi(p);
  246. if (blas_omp_num < 0) blas_omp_num = 0;
  247. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  248. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  249. else blas_num_threads = MAX_CPU_NUMBER;
  250. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  251. if (blas_num_threads > max_num) blas_num_threads = max_num;
  252. #endif
  253. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  254. #ifdef DEBUG
  255. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  256. #endif
  257. blas_cpu_number = blas_num_threads;
  258. return blas_num_threads;
  259. }
  260. #endif
  /*
   * Bookkeeping record for one buffer obtained by an allocator in this file.
   * Each allocator fills in an entry when it succeeds.
   */
  struct release_t {
    void *address;                          /* start address handed back by the allocator */
    void (*func)(struct release_t *record); /* routine that releases `address` */
    long attr;                              /* allocator-specific value (a file descriptor or shm id in this file) */
  };
  266. int hugetlb_allocated = 0;
  267. static struct release_t release_info[NUM_BUFFERS];
  268. static int release_pos = 0;
  269. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  270. static int hot_alloc = 0;
  271. #endif
  272. #ifdef ALLOC_MMAP
  273. static void alloc_mmap_free(struct release_t *release){
  274. if (munmap(release -> address, BUFFER_SIZE)) {
  275. printf("OpenBLAS : munmap failed\n");
  276. }
  277. }
  278. #ifdef NO_WARMUP
  279. static void *alloc_mmap(void *address){
  280. void *map_address;
  281. if (address){
  282. map_address = mmap(address,
  283. BUFFER_SIZE,
  284. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  285. } else {
  286. map_address = mmap(address,
  287. BUFFER_SIZE,
  288. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  289. }
  290. if (map_address != (void *)-1) {
  291. release_info[release_pos].address = map_address;
  292. release_info[release_pos].func = alloc_mmap_free;
  293. release_pos ++;
  294. }
  295. #ifdef OS_LINUX
  296. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  297. #endif
  298. return map_address;
  299. }
  300. #else
  301. #define BENCH_ITERATION 4
  302. #define SCALING 2
  303. static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
  304. BLASULONG original, *p;
  305. BLASULONG start, stop, min;
  306. int iter, i, count;
  307. min = (BLASULONG)-1;
  308. original = *(BLASULONG *)(address + size - PAGESIZE);
  309. *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
  310. for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
  311. p = (BLASULONG *)address;
  312. count = size / PAGESIZE;
  313. start = rpcc();
  314. for (i = 0; i < count; i ++) {
  315. p = (BLASULONG *)(*p);
  316. }
  317. stop = rpcc();
  318. if (min > stop - start) min = stop - start;
  319. }
  320. *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  321. *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
  322. return min;
  323. }
  324. static void *alloc_mmap(void *address){
  325. void *map_address, *best_address;
  326. BLASULONG best, start, current;
  327. BLASULONG allocsize;
  328. if (address){
  329. /* Just give up use advanced operation */
  330. map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  331. #ifdef OS_LINUX
  332. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  333. #endif
  334. } else {
  335. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  336. if (hot_alloc == 0) {
  337. map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  338. #ifdef OS_LINUX
  339. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  340. #endif
  341. } else {
  342. #endif
  343. map_address = mmap(NULL, BUFFER_SIZE * SCALING,
  344. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  345. if (map_address != (void *)-1) {
  346. #ifdef OS_LINUX
  347. #ifdef DEBUG
  348. int ret=0;
  349. ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  350. if(ret==-1){
  351. int errsv=errno;
  352. perror("OpenBLAS alloc_mmap:");
  353. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  354. }
  355. #else
  356. my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  357. #endif
  358. #endif
  359. allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
  360. start = (BLASULONG)map_address;
  361. current = (SCALING - 1) * BUFFER_SIZE;
  362. while(current > 0) {
  363. *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
  364. start += PAGESIZE;
  365. current -= PAGESIZE;
  366. }
  367. *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
  368. start = (BLASULONG)map_address;
  369. best = (BLASULONG)-1;
  370. best_address = map_address;
  371. while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
  372. current = run_bench(start, allocsize);
  373. if (best > current) {
  374. best = current;
  375. best_address = (void *)start;
  376. }
  377. start += PAGESIZE;
  378. }
  379. if ((BLASULONG)best_address > (BLASULONG)map_address)
  380. munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
  381. munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
  382. map_address = best_address;
  383. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  384. hot_alloc = 2;
  385. #endif
  386. }
  387. }
  388. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  389. }
  390. #endif
  391. if (map_address != (void *)-1) {
  392. release_info[release_pos].address = map_address;
  393. release_info[release_pos].func = alloc_mmap_free;
  394. release_pos ++;
  395. }
  396. return map_address;
  397. }
  398. #endif
  399. #endif
  400. #ifdef ALLOC_MALLOC
  401. static void alloc_malloc_free(struct release_t *release){
  402. free(release -> address);
  403. }
  404. static void *alloc_malloc(void *address){
  405. void *map_address;
  406. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  407. if (map_address == (void *)NULL) map_address = (void *)-1;
  408. if (map_address != (void *)-1) {
  409. release_info[release_pos].address = map_address;
  410. release_info[release_pos].func = alloc_malloc_free;
  411. release_pos ++;
  412. }
  413. return map_address;
  414. }
  415. #endif
  416. #ifdef ALLOC_QALLOC
  417. void *qalloc(int flags, size_t bytes);
  418. void *qfree (void *address);
  419. #define QNONCACHE 0x1
  420. #define QCOMMS 0x2
  421. #define QFAST 0x4
  422. static void alloc_qalloc_free(struct release_t *release){
  423. qfree(release -> address);
  424. }
  425. static void *alloc_qalloc(void *address){
  426. void *map_address;
  427. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  428. if (map_address == (void *)NULL) map_address = (void *)-1;
  429. if (map_address != (void *)-1) {
  430. release_info[release_pos].address = map_address;
  431. release_info[release_pos].func = alloc_qalloc_free;
  432. release_pos ++;
  433. }
  434. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  435. }
  436. #endif
  437. #ifdef ALLOC_WINDOWS
  438. static void alloc_windows_free(struct release_t *release){
  439. VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
  440. }
  441. static void *alloc_windows(void *address){
  442. void *map_address;
  443. map_address = VirtualAlloc(address,
  444. BUFFER_SIZE,
  445. MEM_RESERVE | MEM_COMMIT,
  446. PAGE_READWRITE);
  447. if (map_address == (void *)NULL) map_address = (void *)-1;
  448. if (map_address != (void *)-1) {
  449. release_info[release_pos].address = map_address;
  450. release_info[release_pos].func = alloc_windows_free;
  451. release_pos ++;
  452. }
  453. return map_address;
  454. }
  455. #endif
  456. #ifdef ALLOC_DEVICEDRIVER
  457. #ifndef DEVICEDRIVER_NAME
  458. #define DEVICEDRIVER_NAME "/dev/mapper"
  459. #endif
  460. static void alloc_devicedirver_free(struct release_t *release){
  461. if (munmap(release -> address, BUFFER_SIZE)) {
  462. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  463. }
  464. if (close(release -> attr)) {
  465. printf("OpenBLAS : Bugphysarea close failed.\n");
  466. }
  467. }
  468. static void *alloc_devicedirver(void *address){
  469. int fd;
  470. void *map_address;
  471. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  472. return (void *)-1;
  473. }
  474. map_address = mmap(address, BUFFER_SIZE,
  475. PROT_READ | PROT_WRITE,
  476. MAP_FILE | MAP_SHARED,
  477. fd, 0);
  478. if (map_address != (void *)-1) {
  479. release_info[release_pos].address = map_address;
  480. release_info[release_pos].attr = fd;
  481. release_info[release_pos].func = alloc_devicedirver_free;
  482. release_pos ++;
  483. }
  484. return map_address;
  485. }
  486. #endif
  487. #ifdef ALLOC_SHM
  488. static void alloc_shm_free(struct release_t *release){
  489. if (shmdt(release -> address)) {
  490. printf("OpenBLAS : Shared memory unmap failed.\n");
  491. }
  492. }
  493. static void *alloc_shm(void *address){
  494. void *map_address;
  495. int shmid;
  496. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  497. map_address = (void *)shmat(shmid, address, 0);
  498. if (map_address != (void *)-1){
  499. #ifdef OS_LINUX
  500. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  501. #endif
  502. shmctl(shmid, IPC_RMID, 0);
  503. release_info[release_pos].address = map_address;
  504. release_info[release_pos].attr = shmid;
  505. release_info[release_pos].func = alloc_shm_free;
  506. release_pos ++;
  507. }
  508. return map_address;
  509. }
  510. #endif
  511. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /*
   * Releases a buffer obtained by alloc_hugetlb, using the mechanism that
   * matches the platform branch that allocated it:
   *   - Linux / AIX: shmdt() (a failure prints a message);
   *   - Solaris:     munmap() of BUFFER_SIZE bytes;
   *   - Windows:     VirtualFree() with MEM_RELEASE.
   *
   * The Windows branch previously passed MEM_LARGE_PAGES | MEM_DECOMMIT,
   * which is not a documented free type; the region is now released with
   * MEM_RELEASE and a size of 0.
   *
   * NOTE(review): the Solaris allocation path in this file obtains the
   * buffer from memalign(), yet it is released with munmap() here --
   * confirm that this pairing is intended.
   */
  static void alloc_hugetlb_free(struct release_t *release)
  {
  #if defined(OS_LINUX) || defined(OS_AIX)
    if (shmdt(release->address)) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif

  #ifdef __sun__
    munmap(release->address, BUFFER_SIZE);
  #endif

  #ifdef OS_WINDOWS
    VirtualFree(release->address, 0, MEM_RELEASE);
  #endif
  }
  525. static void *alloc_hugetlb(void *address){
  526. void *map_address = (void *)-1;
  527. #if defined(OS_LINUX) || defined(OS_AIX)
  528. int shmid;
  529. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  530. #ifdef OS_LINUX
  531. SHM_HUGETLB |
  532. #endif
  533. #ifdef OS_AIX
  534. SHM_LGPAGE | SHM_PIN |
  535. #endif
  536. IPC_CREAT | SHM_R | SHM_W);
  537. if (shmid != -1) {
  538. map_address = (void *)shmat(shmid, address, SHM_RND);
  539. #ifdef OS_LINUX
  540. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  541. #endif
  542. if (map_address != (void *)-1){
  543. shmctl(shmid, IPC_RMID, 0);
  544. }
  545. }
  546. #endif
  547. #ifdef __sun__
  548. struct memcntl_mha mha;
  549. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  550. mha.mha_flags = 0;
  551. mha.mha_pagesize = HUGE_PAGESIZE;
  552. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  553. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  554. #endif
  555. #ifdef OS_WINDOWS
  556. HANDLE hToken;
  557. TOKEN_PRIVILEGES tp;
  558. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  559. tp.PrivilegeCount = 1;
  560. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  561. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1;
  562. if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1;
  563. map_address = (void *)VirtualAlloc(address,
  564. BUFFER_SIZE,
  565. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  566. PAGE_READWRITE);
  567. AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL);
  568. if (map_address == (void *)NULL) map_address = (void *)-1;
  569. #endif
  570. if (map_address != (void *)-1){
  571. release_info[release_pos].address = map_address;
  572. release_info[release_pos].func = alloc_hugetlb_free;
  573. release_pos ++;
  574. }
  575. return map_address;
  576. }
  577. #endif
  578. #ifdef ALLOC_HUGETLBFILE
  579. static int hugetlb_pid = 0;
  580. static void alloc_hugetlbfile_free(struct release_t *release){
  581. if (munmap(release -> address, BUFFER_SIZE)) {
  582. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  583. }
  584. if (close(release -> attr)) {
  585. printf("OpenBLAS : HugeTLBfs close failed.\n");
  586. }
  587. }
  588. static void *alloc_hugetlbfile(void *address){
  589. void *map_address = (void *)-1;
  590. int fd;
  591. char filename[64];
  592. if (!hugetlb_pid) hugetlb_pid = getpid();
  593. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  594. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  595. return (void *)-1;
  596. }
  597. unlink(filename);
  598. map_address = mmap(address, BUFFER_SIZE,
  599. PROT_READ | PROT_WRITE,
  600. MAP_SHARED,
  601. fd, 0);
  602. if (map_address != (void *)-1) {
  603. release_info[release_pos].address = map_address;
  604. release_info[release_pos].attr = fd;
  605. release_info[release_pos].func = alloc_hugetlbfile_free;
  606. release_pos ++;
  607. }
  608. return map_address;
  609. }
  610. #endif
  611. /* Global lock for memory allocation */
  612. #if defined(USE_PTHREAD_LOCK)
  613. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  614. #elif defined(USE_PTHREAD_SPINLOCK)
  615. static pthread_spinlock_t alloc_lock = 0;
  616. #else
  617. static BLASULONG alloc_lock = 0UL;
  618. #endif
  619. #ifdef SEEK_ADDRESS
  620. static BLASULONG base_address = 0UL;
  621. #else
  622. static BLASULONG base_address = BASE_ADDRESS;
  623. #endif
  624. static volatile struct {
  625. BLASULONG lock;
  626. void *addr;
  627. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  628. int pos;
  629. #endif
  630. int used;
  631. #ifndef __64BIT__
  632. char dummy[48];
  633. #else
  634. char dummy[40];
  635. #endif
  636. } memory[NUM_BUFFERS];
  637. static int memory_initialized = 0;
  638. static void gotoblas_memory_init(void);
  639. /* Memory allocation routine */
  640. /* procpos ... indicates where it comes from */
  641. /* 0 : Level 3 functions */
  642. /* 1 : Level 2 functions */
  643. /* 2 : Thread */
  644. void *blas_memory_alloc(int procpos){
  645. int position;
  646. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  647. int mypos;
  648. #endif
  649. void *map_address;
  650. void *(*memoryalloc[])(void *address) = {
  651. #ifdef ALLOC_DEVICEDRIVER
  652. alloc_devicedirver,
  653. #endif
  654. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  655. alloc_hugetlb,
  656. #endif
  657. #ifdef ALLOC_SHM
  658. alloc_shm,
  659. #endif
  660. #ifdef ALLOC_MMAP
  661. alloc_mmap,
  662. #endif
  663. #ifdef ALLOC_QALLOC
  664. alloc_qalloc,
  665. #endif
  666. #ifdef ALLOC_WINDOWS
  667. alloc_windows,
  668. #endif
  669. #ifdef ALLOC_MALLOC
  670. alloc_malloc,
  671. #endif
  672. NULL,
  673. };
  674. void *(**func)(void *address);
  675. if (!memory_initialized) {
  676. LOCK_COMMAND(&alloc_lock);
  677. if (!memory_initialized) {
  678. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  679. for (position = 0; position < NUM_BUFFERS; position ++){
  680. memory[position].addr = (void *)0;
  681. memory[position].pos = -1;
  682. memory[position].used = 0;
  683. memory[position].lock = 0;
  684. }
  685. #endif
  686. #ifdef DYNAMIC_ARCH
  687. gotoblas_dynamic_init();
  688. #endif
  689. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  690. gotoblas_affinity_init();
  691. #endif
  692. #ifdef SMP
  693. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  694. #endif
  695. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
  696. #ifndef DYNAMIC_ARCH
  697. blas_set_parameter();
  698. #endif
  699. #endif
  700. memory_initialized = 1;
  701. }
  702. UNLOCK_COMMAND(&alloc_lock);
  703. }
  704. #ifdef DEBUG
  705. printf("Alloc Start ...\n");
  706. #endif
  707. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  708. mypos = WhereAmI();
  709. position = mypos;
  710. while (position > NUM_BUFFERS) position >>= 1;
  711. do {
  712. if (!memory[position].used && (memory[position].pos == mypos)) {
  713. blas_lock(&memory[position].lock);
  714. if (!memory[position].used) goto allocation;
  715. blas_unlock(&memory[position].lock);
  716. }
  717. position ++;
  718. } while (position < NUM_BUFFERS);
  719. #endif
  720. position = 0;
  721. do {
  722. if (!memory[position].used) {
  723. blas_lock(&memory[position].lock);
  724. if (!memory[position].used) goto allocation;
  725. blas_unlock(&memory[position].lock);
  726. }
  727. position ++;
  728. } while (position < NUM_BUFFERS);
  729. goto error;
  730. allocation :
  731. #ifdef DEBUG
  732. printf(" Position -> %d\n", position);
  733. #endif
  734. memory[position].used = 1;
  735. blas_unlock(&memory[position].lock);
  736. if (!memory[position].addr) {
  737. do {
  738. #ifdef DEBUG
  739. printf("Allocation Start : %lx\n", base_address);
  740. #endif
  741. map_address = (void *)-1;
  742. func = &memoryalloc[0];
  743. while ((func != NULL) && (map_address == (void *) -1)) {
  744. map_address = (*func)((void *)base_address);
  745. #ifdef ALLOC_DEVICEDRIVER
  746. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  747. fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
  748. }
  749. #endif
  750. #ifdef ALLOC_HUGETLBFILE
  751. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  752. #ifndef OS_WINDOWS
  753. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  754. #endif
  755. }
  756. #endif
  757. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  758. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  759. #endif
  760. func ++;
  761. }
  762. #ifdef DEBUG
  763. printf(" Success -> %08lx\n", map_address);
  764. #endif
  765. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  766. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  767. } while ((BLASLONG)map_address == -1);
  768. memory[position].addr = map_address;
  769. #ifdef DEBUG
  770. printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
  771. #endif
  772. }
  773. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  774. if (memory[position].pos == -1) memory[position].pos = mypos;
  775. #endif
  776. #ifdef DYNAMIC_ARCH
  777. if (memory_initialized == 1) {
  778. LOCK_COMMAND(&alloc_lock);
  779. if (memory_initialized == 1) {
  780. if (!gotoblas) gotoblas_dynamic_init();
  781. memory_initialized = 2;
  782. }
  783. UNLOCK_COMMAND(&alloc_lock);
  784. }
  785. #endif
  786. #ifdef DEBUG
  787. printf("Mapped : %p %3d\n\n",
  788. (void *)memory[position].addr, position);
  789. #endif
  790. return (void *)memory[position].addr;
  791. error:
  792. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  793. return NULL;
  794. }
  795. void blas_memory_free(void *free_area){
  796. int position;
  797. #ifdef DEBUG
  798. printf("Unmapped Start : %p ...\n", free_area);
  799. #endif
  800. position = 0;
  801. while ((memory[position].addr != free_area)
  802. && (position < NUM_BUFFERS)) position++;
  803. if (memory[position].addr != free_area) goto error;
  804. #ifdef DEBUG
  805. printf(" Position : %d\n", position);
  806. #endif
  807. memory[position].used = 0;
  808. #ifdef DEBUG
  809. printf("Unmap Succeeded.\n\n");
  810. #endif
  811. return;
  812. error:
  813. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  814. #ifdef DEBUG
  815. for (position = 0; position < NUM_BUFFERS; position++)
  816. printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
  817. #endif
  818. return;
  819. }
  820. void blas_shutdown(void){
  821. int pos;
  822. #ifdef SMP
  823. BLASFUNC(blas_thread_shutdown)();
  824. #endif
  825. LOCK_COMMAND(&alloc_lock);
  826. for (pos = 0; pos < release_pos; pos ++) {
  827. release_info[pos].func(&release_info[pos]);
  828. }
  829. #ifdef SEEK_ADDRESS
  830. base_address = 0UL;
  831. #else
  832. base_address = BASE_ADDRESS;
  833. #endif
  834. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  835. memory[pos].addr = (void *)0;
  836. memory[pos].used = 0;
  837. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  838. memory[pos].pos = -1;
  839. #endif
  840. memory[pos].lock = 0;
  841. }
  842. UNLOCK_COMMAND(&alloc_lock);
  843. return;
  844. }
  845. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  846. #ifdef SMP
  847. #if defined(USE_PTHREAD_LOCK)
  848. static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  849. #elif defined(USE_PTHREAD_SPINLOCK)
  850. static pthread_spinlock_t init_lock = 0;
  851. #else
  852. static BLASULONG init_lock = 0UL;
  853. #endif
  854. #endif
  855. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  856. void *sa, void *sb, BLASLONG pos) {
  857. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  858. size_t size;
  859. BLASULONG buffer;
  860. size = BUFFER_SIZE - PAGESIZE;
  861. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  862. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  863. if (hot_alloc != 2) {
  864. #endif
  865. #ifdef SMP
  866. LOCK_COMMAND(&init_lock);
  867. #endif
  868. while (size > 0) {
  869. *(int *)buffer = size;
  870. buffer += PAGESIZE;
  871. size -= PAGESIZE;
  872. }
  873. #ifdef SMP
  874. UNLOCK_COMMAND(&init_lock);
  875. #endif
  876. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  877. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  878. while (size > 0) {
  879. *(int *)buffer = size;
  880. buffer += 64;
  881. size -= 64;
  882. }
  883. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  884. }
  885. #endif
  886. #endif
  887. }
  888. #ifdef SMP
  889. static void _init_thread_memory(void *buffer) {
  890. blas_queue_t queue[MAX_CPU_NUMBER];
  891. int num_cpu;
  892. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  893. blas_queue_init(&queue[num_cpu]);
  894. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  895. queue[num_cpu].routine = &_touch_memory;
  896. queue[num_cpu].args = NULL;
  897. queue[num_cpu].next = &queue[num_cpu + 1];
  898. }
  899. queue[num_cpu - 1].next = NULL;
  900. queue[0].sa = buffer;
  901. exec_blas(num_cpu, queue);
  902. }
  903. #endif
  904. static void gotoblas_memory_init(void) {
  905. void *buffer;
  906. hot_alloc = 1;
  907. buffer = (void *)blas_memory_alloc(0);
  908. #ifdef SMP
  909. if (blas_cpu_number == 0) blas_get_cpu_number();
  910. #ifdef SMP_SERVER
  911. if (blas_server_avail == 0) blas_thread_init();
  912. #endif
  913. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  914. #else
  915. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  916. #endif
  917. blas_memory_free(buffer);
  918. }
  919. #endif
  920. /* Initialization for all function; this function should be called before main */
  921. static int gotoblas_initialized = 0;
  922. void CONSTRUCTOR gotoblas_init(void) {
  923. if (gotoblas_initialized) return;
  924. #ifdef SMP
  925. openblas_fork_handler();
  926. #endif
  927. #ifdef PROFILE
  928. moncontrol (0);
  929. #endif
  930. #ifdef DYNAMIC_ARCH
  931. gotoblas_dynamic_init();
  932. #endif
  933. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  934. gotoblas_affinity_init();
  935. #endif
  936. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  937. gotoblas_memory_init();
  938. #endif
  939. #ifdef SMP
  940. if (blas_cpu_number == 0) blas_get_cpu_number();
  941. #ifdef SMP_SERVER
  942. if (blas_server_avail == 0) blas_thread_init();
  943. #endif
  944. #endif
  945. #ifdef FUNCTION_PROFILE
  946. gotoblas_profile_init();
  947. #endif
  948. gotoblas_initialized = 1;
  949. #ifdef PROFILE
  950. moncontrol (1);
  951. #endif
  952. }
  953. void DESTRUCTOR gotoblas_quit(void) {
  954. if (gotoblas_initialized == 0) return;
  955. #ifdef PROFILE
  956. moncontrol (0);
  957. #endif
  958. #ifdef FUNCTION_PROFILE
  959. gotoblas_profile_quit();
  960. #endif
  961. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  962. gotoblas_affinity_quit();
  963. #endif
  964. #ifdef DYNAMIC_ARCH
  965. gotoblas_dynamic_quit();
  966. #endif
  967. gotoblas_initialized = 0;
  968. #ifdef PROFILE
  969. moncontrol (1);
  970. #endif
  971. blas_shutdown();
  972. }
  973. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  974. /* Don't call me; this is just work around for PGI / Sun bug */
  975. void gotoblas_dummy_for_PGI(void) {
  976. gotoblas_init();
  977. gotoblas_quit();
  978. #if 0
  979. asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  980. asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  981. #else
  982. asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  983. asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  984. #endif
  985. }
  986. #endif