You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

memory.c 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377
  1. /*****************************************************************************
  2. Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************/
  28. /* Copyright 2009, 2010 The University of Texas at Austin. */
  29. /* All rights reserved. */
  30. /* */
  31. /* Redistribution and use in source and binary forms, with or */
  32. /* without modification, are permitted provided that the following */
  33. /* conditions are met: */
  34. /* */
  35. /* 1. Redistributions of source code must retain the above */
  36. /* copyright notice, this list of conditions and the following */
  37. /* disclaimer. */
  38. /* */
  39. /* 2. Redistributions in binary form must reproduce the above */
  40. /* copyright notice, this list of conditions and the following */
  41. /* disclaimer in the documentation and/or other materials */
  42. /* provided with the distribution. */
  43. /* */
  44. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  45. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  46. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  47. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  48. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  49. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  50. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  51. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  52. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  53. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  54. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  55. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  56. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  57. /* POSSIBILITY OF SUCH DAMAGE. */
  58. /* */
  59. /* The views and conclusions contained in the software and */
  60. /* documentation are those of the authors and should not be */
  61. /* interpreted as representing official policies, either expressed */
  62. /* or implied, of The University of Texas at Austin. */
  63. /*********************************************************************/
  64. //#undef DEBUG
  65. #include "common.h"
  66. #include <errno.h>
  67. #ifdef OS_WINDOWS
  68. #define ALLOC_WINDOWS
  69. #ifndef MEM_LARGE_PAGES
  70. #define MEM_LARGE_PAGES 0x20000000
  71. #endif
  72. #else
  73. #define ALLOC_MMAP
  74. #define ALLOC_MALLOC
  75. #endif
  76. #include <stdlib.h>
  77. #include <stdio.h>
  78. #include <fcntl.h>
  79. #ifndef OS_WINDOWS
  80. #include <sys/mman.h>
  81. #include <sys/shm.h>
  82. #include <sys/ipc.h>
  83. #endif
  84. #include <sys/types.h>
  85. #ifdef OS_LINUX
  86. #include <sys/sysinfo.h>
  87. #include <sched.h>
  88. #include <errno.h>
  89. #include <linux/unistd.h>
  90. #include <sys/syscall.h>
  91. #endif
  92. #if defined(OS_FREEBSD) || defined(OS_DARWIN)
  93. #include <sys/sysctl.h>
  94. #include <sys/resource.h>
  95. #endif
  96. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  97. #include <conio.h>
  98. #undef printf
  99. #define printf _cprintf
  100. #endif
  101. #ifdef OS_LINUX
  102. #ifndef MPOL_PREFERRED
  103. #define MPOL_PREFERRED 1
  104. #endif
  105. #endif
  106. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  107. #define NO_WARMUP
  108. #endif
  109. #ifndef SHM_HUGETLB
  110. #define SHM_HUGETLB 04000
  111. #endif
  112. #ifndef FIXED_PAGESIZE
  113. #define FIXED_PAGESIZE 4096
  114. #endif
  115. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  116. #define CONSTRUCTOR __attribute__ ((constructor))
  117. #define DESTRUCTOR __attribute__ ((destructor))
  118. #ifdef DYNAMIC_ARCH
  119. gotoblas_t *gotoblas = NULL;
  120. #endif
  121. extern void openblas_warning(int verbose, const char * msg);
  122. #ifndef SMP
  123. #define blas_cpu_number 1
  124. #define blas_num_threads 1
  /*
   * Single-threaded (non-SMP) build: the thread-control entry points are kept
   * so that callers still link, but exactly one processor is always reported
   * and requests to change the thread count are ignored.
   */
  int goto_get_num_procs (void) { return 1; }
  void goto_set_num_threads(int num_threads) { (void)num_threads; }
  128. #else
  129. #ifdef OS_LINUX
  130. #ifndef NO_AFFINITY
  131. int get_num_procs(void);
  132. #else
  /*
   * Returns sysconf(_SC_NPROCESSORS_ONLN).  The value is queried on the first
   * call and cached in a function-local static for later calls.
   */
  int get_num_procs(void) {
    static int cached_procs = 0;

    if (cached_procs == 0) {
      cached_procs = sysconf(_SC_NPROCESSORS_ONLN);
    }
    return cached_procs;
  }
  138. #endif
  139. #endif
  140. #ifdef OS_WINDOWS
  141. int get_num_procs(void) {
  142. static int nums = 0;
  143. if (nums == 0) {
  144. SYSTEM_INFO sysinfo;
  145. GetSystemInfo(&sysinfo);
  146. nums = sysinfo.dwNumberOfProcessors;
  147. }
  148. return nums;
  149. }
  150. #endif
  151. #if defined(OS_FREEBSD)
  152. int get_num_procs(void) {
  153. static int nums = 0;
  154. int m[2];
  155. size_t len;
  156. if (nums == 0) {
  157. m[0] = CTL_HW;
  158. m[1] = HW_NCPU;
  159. len = sizeof(int);
  160. sysctl(m, 2, &nums, &len, NULL, 0);
  161. }
  162. return nums;
  163. }
  164. #endif
  165. #if defined(OS_DARWIN)
  /*
   * Darwin: returns the "hw.physicalcpu" value obtained through sysctlbyname().
   * The result is cached in a function-local static.
   *
   * If the query fails, or yields a value below 1, one processor is reported
   * instead of 0 so callers never derive a zero thread count.
   */
  int get_num_procs(void) {
    static int nums = 0;
    size_t len;

    if (nums == 0){
      len = sizeof(int);

      if (sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0) != 0 || nums < 1) {
        nums = 1;
      }
    }

    return nums;
  }
  175. /*
  176. void set_stack_limit(int limitMB){
  177. int result=0;
  178. struct rlimit rl;
  179. rlim_t StackSize;
  180. StackSize=limitMB*1024*1024;
  181. result=getrlimit(RLIMIT_STACK, &rl);
  182. if(result==0){
  183. if(rl.rlim_cur < StackSize){
  184. rl.rlim_cur=StackSize;
  185. result=setrlimit(RLIMIT_STACK, &rl);
  186. if(result !=0){
  187. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  188. }
  189. }
  190. }
  191. }
  192. */
  193. #endif
  /*
  OpenBLAS uses the number of CPU cores for multithreading.
  It can be set with openblas_set_num_threads(int num_threads).
  */
  198. int blas_cpu_number = 0;
  /*
  The number of threads in the thread pool.
  This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
  */
  203. int blas_num_threads = 0;
  204. int goto_get_num_procs (void) {
  205. return blas_cpu_number;
  206. }
  /*
   * Installs BLASFUNC(blas_thread_shutdown) as the pthread_atfork() "prepare"
   * handler.  The body is compiled out on Windows and when SMP_SERVER is not
   * defined.  A registration failure is reported through openblas_warning().
   *
   * The handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   * built with "make USE_OPENMP=0".  Hanging can still happen when OpenBLAS is
   * built against the libgomp implementation of OpenMP; that problem is
   * tracked at http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 .  In the
   * meantime, build with USE_OPENMP=0 or link against another implementation
   * of OpenMP.
   */
  void openblas_fork_handler()
  {
  #if !defined(OS_WINDOWS) && defined(SMP_SERVER)
    int status = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);

    if (status != 0) {
      openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
    }
  #endif
  }
  223. int blas_get_cpu_number(void){
  224. env_var_t p;
  225. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  226. int max_num;
  227. #endif
  228. int blas_goto_num = 0;
  229. int blas_omp_num = 0;
  230. if (blas_num_threads) return blas_num_threads;
  231. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  232. max_num = get_num_procs();
  233. #endif
  234. blas_goto_num = 0;
  235. #ifndef USE_OPENMP
  236. if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
  237. if (blas_goto_num < 0) blas_goto_num = 0;
  238. if (blas_goto_num == 0) {
  239. if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
  240. if (blas_goto_num < 0) blas_goto_num = 0;
  241. }
  242. #endif
  243. blas_omp_num = 0;
  244. if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
  245. if (blas_omp_num < 0) blas_omp_num = 0;
  246. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  247. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  248. else blas_num_threads = MAX_CPU_NUMBER;
  249. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  250. if (blas_num_threads > max_num) blas_num_threads = max_num;
  251. #endif
  252. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  253. #ifdef DEBUG
  254. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  255. #endif
  256. blas_cpu_number = blas_num_threads;
  257. return blas_num_threads;
  258. }
  259. #endif
  /*
   * Record describing one buffer obtained from a backend allocator.  Each
   * alloc_*() routine in this file fills in the next entry of release_info[]
   * after a successful allocation.
   */
  struct release_t {
    /* Address returned by the backend allocator. */
    void *address;
    /* Backend-specific routine that undoes the allocation (e.g. alloc_mmap_free). */
    void (*func)(struct release_t *);
    /* Extra backend data; the file-backed allocators store a file descriptor
       here and alloc_shm stores the shared-memory id. */
    long attr;
  };
  265. int hugetlb_allocated = 0;
  266. static struct release_t release_info[NUM_BUFFERS];
  267. static int release_pos = 0;
  268. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  269. static int hot_alloc = 0;
  270. #endif
  271. #ifdef ALLOC_MMAP
  272. static void alloc_mmap_free(struct release_t *release){
  273. if (munmap(release -> address, BUFFER_SIZE)) {
  274. printf("OpenBLAS : munmap failed\n");
  275. }
  276. }
  277. #ifdef NO_WARMUP
  278. static void *alloc_mmap(void *address){
  279. void *map_address;
  280. if (address){
  281. map_address = mmap(address,
  282. BUFFER_SIZE,
  283. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  284. } else {
  285. map_address = mmap(address,
  286. BUFFER_SIZE,
  287. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  288. }
  289. if (map_address != (void *)-1) {
  290. release_info[release_pos].address = map_address;
  291. release_info[release_pos].func = alloc_mmap_free;
  292. release_pos ++;
  293. }
  294. #ifdef OS_LINUX
  295. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  296. #endif
  297. return map_address;
  298. }
  299. #else
  300. #define BENCH_ITERATION 4
  301. #define SCALING 2
  302. static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
  303. BLASULONG original, *p;
  304. BLASULONG start, stop, min;
  305. int iter, i, count;
  306. min = (BLASULONG)-1;
  307. original = *(BLASULONG *)(address + size - PAGESIZE);
  308. *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
  309. for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
  310. p = (BLASULONG *)address;
  311. count = size / PAGESIZE;
  312. start = rpcc();
  313. for (i = 0; i < count; i ++) {
  314. p = (BLASULONG *)(*p);
  315. }
  316. stop = rpcc();
  317. if (min > stop - start) min = stop - start;
  318. }
  319. *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  320. *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
  321. return min;
  322. }
  323. static void *alloc_mmap(void *address){
  324. void *map_address, *best_address;
  325. BLASULONG best, start, current;
  326. BLASULONG allocsize;
  327. if (address){
  328. /* Just give up use advanced operation */
  329. map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  330. #ifdef OS_LINUX
  331. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  332. #endif
  333. } else {
  334. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  335. if (hot_alloc == 0) {
  336. map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  337. #ifdef OS_LINUX
  338. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  339. #endif
  340. } else {
  341. #endif
  342. map_address = mmap(NULL, BUFFER_SIZE * SCALING,
  343. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  344. if (map_address != (void *)-1) {
  345. #ifdef OS_LINUX
  346. #ifdef DEBUG
  347. int ret=0;
  348. ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  349. if(ret==-1){
  350. int errsv=errno;
  351. perror("OpenBLAS alloc_mmap:");
  352. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  353. }
  354. #else
  355. my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  356. #endif
  357. #endif
  358. allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
  359. start = (BLASULONG)map_address;
  360. current = (SCALING - 1) * BUFFER_SIZE;
  361. while(current > 0) {
  362. *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
  363. start += PAGESIZE;
  364. current -= PAGESIZE;
  365. }
  366. *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
  367. start = (BLASULONG)map_address;
  368. best = (BLASULONG)-1;
  369. best_address = map_address;
  370. while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
  371. current = run_bench(start, allocsize);
  372. if (best > current) {
  373. best = current;
  374. best_address = (void *)start;
  375. }
  376. start += PAGESIZE;
  377. }
  378. if ((BLASULONG)best_address > (BLASULONG)map_address)
  379. munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
  380. munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
  381. map_address = best_address;
  382. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  383. hot_alloc = 2;
  384. #endif
  385. }
  386. }
  387. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  388. }
  389. #endif
  390. if (map_address != (void *)-1) {
  391. release_info[release_pos].address = map_address;
  392. release_info[release_pos].func = alloc_mmap_free;
  393. release_pos ++;
  394. }
  395. return map_address;
  396. }
  397. #endif
  398. #endif
  399. #ifdef ALLOC_MALLOC
  400. static void alloc_malloc_free(struct release_t *release){
  401. free(release -> address);
  402. }
  403. static void *alloc_malloc(void *address){
  404. void *map_address;
  405. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  406. if (map_address == (void *)NULL) map_address = (void *)-1;
  407. if (map_address != (void *)-1) {
  408. release_info[release_pos].address = map_address;
  409. release_info[release_pos].func = alloc_malloc_free;
  410. release_pos ++;
  411. }
  412. return map_address;
  413. }
  414. #endif
  415. #ifdef ALLOC_QALLOC
  416. void *qalloc(int flags, size_t bytes);
  417. void *qfree (void *address);
  418. #define QNONCACHE 0x1
  419. #define QCOMMS 0x2
  420. #define QFAST 0x4
  421. static void alloc_qalloc_free(struct release_t *release){
  422. qfree(release -> address);
  423. }
  424. static void *alloc_qalloc(void *address){
  425. void *map_address;
  426. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  427. if (map_address == (void *)NULL) map_address = (void *)-1;
  428. if (map_address != (void *)-1) {
  429. release_info[release_pos].address = map_address;
  430. release_info[release_pos].func = alloc_qalloc_free;
  431. release_pos ++;
  432. }
  433. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  434. }
  435. #endif
  436. #ifdef ALLOC_WINDOWS
  437. static void alloc_windows_free(struct release_t *release){
  438. VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
  439. }
  440. static void *alloc_windows(void *address){
  441. void *map_address;
  442. map_address = VirtualAlloc(address,
  443. BUFFER_SIZE,
  444. MEM_RESERVE | MEM_COMMIT,
  445. PAGE_READWRITE);
  446. if (map_address == (void *)NULL) map_address = (void *)-1;
  447. if (map_address != (void *)-1) {
  448. release_info[release_pos].address = map_address;
  449. release_info[release_pos].func = alloc_windows_free;
  450. release_pos ++;
  451. }
  452. return map_address;
  453. }
  454. #endif
  455. #ifdef ALLOC_DEVICEDRIVER
  456. #ifndef DEVICEDRIVER_NAME
  457. #define DEVICEDRIVER_NAME "/dev/mapper"
  458. #endif
  459. static void alloc_devicedirver_free(struct release_t *release){
  460. if (munmap(release -> address, BUFFER_SIZE)) {
  461. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  462. }
  463. if (close(release -> attr)) {
  464. printf("OpenBLAS : Bugphysarea close failed.\n");
  465. }
  466. }
  467. static void *alloc_devicedirver(void *address){
  468. int fd;
  469. void *map_address;
  470. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  471. return (void *)-1;
  472. }
  473. map_address = mmap(address, BUFFER_SIZE,
  474. PROT_READ | PROT_WRITE,
  475. MAP_FILE | MAP_SHARED,
  476. fd, 0);
  477. if (map_address != (void *)-1) {
  478. release_info[release_pos].address = map_address;
  479. release_info[release_pos].attr = fd;
  480. release_info[release_pos].func = alloc_devicedirver_free;
  481. release_pos ++;
  482. }
  483. return map_address;
  484. }
  485. #endif
  486. #ifdef ALLOC_SHM
  487. static void alloc_shm_free(struct release_t *release){
  488. if (shmdt(release -> address)) {
  489. printf("OpenBLAS : Shared memory unmap failed.\n");
  490. }
  491. }
  492. static void *alloc_shm(void *address){
  493. void *map_address;
  494. int shmid;
  495. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  496. map_address = (void *)shmat(shmid, address, 0);
  497. if (map_address != (void *)-1){
  498. #ifdef OS_LINUX
  499. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  500. #endif
  501. shmctl(shmid, IPC_RMID, 0);
  502. release_info[release_pos].address = map_address;
  503. release_info[release_pos].attr = shmid;
  504. release_info[release_pos].func = alloc_shm_free;
  505. release_pos ++;
  506. }
  507. return map_address;
  508. }
  509. #endif
  510. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /*
   * Release routine for alloc_hugetlb().  Each platform undoes the allocation
   * with the counterpart of the call that created it:
   *   Linux / AIX : shmdt()       (buffer came from shmat)
   *   Solaris     : free()        (buffer came from memalign, i.e. the heap,
   *                                so it must not be munmap()ed)
   *   Windows     : VirtualFree() with MEM_RELEASE and size 0, the form
   *                 required to release a region obtained from VirtualAlloc;
   *                 MEM_LARGE_PAGES is an allocation flag, not a free type.
   */
  static void alloc_hugetlb_free(struct release_t *release){

  #if defined(OS_LINUX) || defined(OS_AIX)
    if (shmdt(release -> address)) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif

  #ifdef __sun__
    free(release -> address);
  #endif

  #ifdef OS_WINDOWS
    VirtualFree(release -> address, 0, MEM_RELEASE);
  #endif
  }
  524. static void *alloc_hugetlb(void *address){
  525. void *map_address = (void *)-1;
  526. #if defined(OS_LINUX) || defined(OS_AIX)
  527. int shmid;
  528. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  529. #ifdef OS_LINUX
  530. SHM_HUGETLB |
  531. #endif
  532. #ifdef OS_AIX
  533. SHM_LGPAGE | SHM_PIN |
  534. #endif
  535. IPC_CREAT | SHM_R | SHM_W);
  536. if (shmid != -1) {
  537. map_address = (void *)shmat(shmid, address, SHM_RND);
  538. #ifdef OS_LINUX
  539. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  540. #endif
  541. if (map_address != (void *)-1){
  542. shmctl(shmid, IPC_RMID, 0);
  543. }
  544. }
  545. #endif
  546. #ifdef __sun__
  547. struct memcntl_mha mha;
  548. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  549. mha.mha_flags = 0;
  550. mha.mha_pagesize = HUGE_PAGESIZE;
  551. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  552. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  553. #endif
  554. #ifdef OS_WINDOWS
  555. HANDLE hToken;
  556. TOKEN_PRIVILEGES tp;
  557. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  558. tp.PrivilegeCount = 1;
  559. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  560. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
  561. CloseHandle(hToken);
  562. return -1;
  563. }
  564. if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
  565. CloseHandle(hToken);
  566. return -1;
  567. }
  568. map_address = (void *)VirtualAlloc(address,
  569. BUFFER_SIZE,
  570. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  571. PAGE_READWRITE);
  572. tp.Privileges[0].Attributes = 0;
  573. AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  574. if (map_address == (void *)NULL) map_address = (void *)-1;
  575. #endif
  576. if (map_address != (void *)-1){
  577. release_info[release_pos].address = map_address;
  578. release_info[release_pos].func = alloc_hugetlb_free;
  579. release_pos ++;
  580. }
  581. return map_address;
  582. }
  583. #endif
  584. #ifdef ALLOC_HUGETLBFILE
  585. static int hugetlb_pid = 0;
  586. static void alloc_hugetlbfile_free(struct release_t *release){
  587. if (munmap(release -> address, BUFFER_SIZE)) {
  588. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  589. }
  590. if (close(release -> attr)) {
  591. printf("OpenBLAS : HugeTLBfs close failed.\n");
  592. }
  593. }
  594. static void *alloc_hugetlbfile(void *address){
  595. void *map_address = (void *)-1;
  596. int fd;
  597. char filename[64];
  598. if (!hugetlb_pid) hugetlb_pid = getpid();
  599. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  600. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  601. return (void *)-1;
  602. }
  603. unlink(filename);
  604. map_address = mmap(address, BUFFER_SIZE,
  605. PROT_READ | PROT_WRITE,
  606. MAP_SHARED,
  607. fd, 0);
  608. if (map_address != (void *)-1) {
  609. release_info[release_pos].address = map_address;
  610. release_info[release_pos].attr = fd;
  611. release_info[release_pos].func = alloc_hugetlbfile_free;
  612. release_pos ++;
  613. }
  614. return map_address;
  615. }
  616. #endif
  617. /* Global lock for memory allocation */
  618. #if defined(USE_PTHREAD_LOCK)
  619. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  620. #elif defined(USE_PTHREAD_SPINLOCK)
  621. static pthread_spinlock_t alloc_lock = 0;
  622. #else
  623. static BLASULONG alloc_lock = 0UL;
  624. #endif
  625. #ifdef SEEK_ADDRESS
  626. static BLASULONG base_address = 0UL;
  627. #else
  628. static BLASULONG base_address = BASE_ADDRESS;
  629. #endif
  /*
   * Table of buffer slots handed out by blas_memory_alloc() and returned
   * through blas_memory_free().  One entry per buffer, NUM_BUFFERS in total.
   */
  static volatile struct {
    /* Per-slot lock taken with blas_lock() / blas_unlock(). */
    BLASULONG lock;
    /* Address of the buffer mapped for this slot; 0 until one has been mapped. */
    void *addr;
  #if defined(WHEREAMI) && !defined(USE_OPENMP)
    /* WhereAmI() value of the first caller that claimed the slot; -1 when unset. */
    int pos;
  #endif
    /* Non-zero while the slot is handed out to a caller. */
    int used;
    /* Padding.  NOTE(review): presumably sizes each entry so neighbouring
       slots do not share a cache line -- confirm against the target ABI. */
  #ifndef __64BIT__
    char dummy[48];
  #else
    char dummy[40];
  #endif
  } memory[NUM_BUFFERS];
  643. static int memory_initialized = 0;
  644. static void gotoblas_memory_init(void);
  645. /* Memory allocation routine */
  646. /* procpos ... indicates where it comes from */
  647. /* 0 : Level 3 functions */
  648. /* 1 : Level 2 functions */
  649. /* 2 : Thread */
  650. void *blas_memory_alloc(int procpos){
  651. int position;
  652. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  653. int mypos;
  654. #endif
  655. void *map_address;
  656. void *(*memoryalloc[])(void *address) = {
  657. #ifdef ALLOC_DEVICEDRIVER
  658. alloc_devicedirver,
  659. #endif
  660. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  661. alloc_hugetlb,
  662. #endif
  663. #ifdef ALLOC_SHM
  664. alloc_shm,
  665. #endif
  666. #ifdef ALLOC_MMAP
  667. alloc_mmap,
  668. #endif
  669. #ifdef ALLOC_QALLOC
  670. alloc_qalloc,
  671. #endif
  672. #ifdef ALLOC_WINDOWS
  673. alloc_windows,
  674. #endif
  675. #ifdef ALLOC_MALLOC
  676. alloc_malloc,
  677. #endif
  678. NULL,
  679. };
  680. void *(**func)(void *address);
  681. if (!memory_initialized) {
  682. LOCK_COMMAND(&alloc_lock);
  683. if (!memory_initialized) {
  684. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  685. for (position = 0; position < NUM_BUFFERS; position ++){
  686. memory[position].addr = (void *)0;
  687. memory[position].pos = -1;
  688. memory[position].used = 0;
  689. memory[position].lock = 0;
  690. }
  691. #endif
  692. #ifdef DYNAMIC_ARCH
  693. gotoblas_dynamic_init();
  694. #endif
  695. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  696. gotoblas_affinity_init();
  697. #endif
  698. #ifdef SMP
  699. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  700. #endif
  701. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
  702. #ifndef DYNAMIC_ARCH
  703. blas_set_parameter();
  704. #endif
  705. #endif
  706. memory_initialized = 1;
  707. }
  708. UNLOCK_COMMAND(&alloc_lock);
  709. }
  710. #ifdef DEBUG
  711. printf("Alloc Start ...\n");
  712. #endif
  713. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  714. mypos = WhereAmI();
  715. position = mypos;
  716. while (position > NUM_BUFFERS) position >>= 1;
  717. do {
  718. if (!memory[position].used && (memory[position].pos == mypos)) {
  719. blas_lock(&memory[position].lock);
  720. if (!memory[position].used) goto allocation;
  721. blas_unlock(&memory[position].lock);
  722. }
  723. position ++;
  724. } while (position < NUM_BUFFERS);
  725. #endif
  726. position = 0;
  727. do {
  728. if (!memory[position].used) {
  729. blas_lock(&memory[position].lock);
  730. if (!memory[position].used) goto allocation;
  731. blas_unlock(&memory[position].lock);
  732. }
  733. position ++;
  734. } while (position < NUM_BUFFERS);
  735. goto error;
  736. allocation :
  737. #ifdef DEBUG
  738. printf(" Position -> %d\n", position);
  739. #endif
  740. memory[position].used = 1;
  741. blas_unlock(&memory[position].lock);
  742. if (!memory[position].addr) {
  743. do {
  744. #ifdef DEBUG
  745. printf("Allocation Start : %lx\n", base_address);
  746. #endif
  747. map_address = (void *)-1;
  748. func = &memoryalloc[0];
  749. while ((func != NULL) && (map_address == (void *) -1)) {
  750. map_address = (*func)((void *)base_address);
  751. #ifdef ALLOC_DEVICEDRIVER
  752. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  753. fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
  754. }
  755. #endif
  756. #ifdef ALLOC_HUGETLBFILE
  757. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  758. #ifndef OS_WINDOWS
  759. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  760. #endif
  761. }
  762. #endif
  763. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  764. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  765. #endif
  766. func ++;
  767. }
  768. #ifdef DEBUG
  769. printf(" Success -> %08lx\n", map_address);
  770. #endif
  771. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  772. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  773. } while ((BLASLONG)map_address == -1);
  774. memory[position].addr = map_address;
  775. #ifdef DEBUG
  776. printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
  777. #endif
  778. }
  779. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  780. if (memory[position].pos == -1) memory[position].pos = mypos;
  781. #endif
  782. #ifdef DYNAMIC_ARCH
  783. if (memory_initialized == 1) {
  784. LOCK_COMMAND(&alloc_lock);
  785. if (memory_initialized == 1) {
  786. if (!gotoblas) gotoblas_dynamic_init();
  787. memory_initialized = 2;
  788. }
  789. UNLOCK_COMMAND(&alloc_lock);
  790. }
  791. #endif
  792. #ifdef DEBUG
  793. printf("Mapped : %p %3d\n\n",
  794. (void *)memory[position].addr, position);
  795. #endif
  796. return (void *)memory[position].addr;
  797. error:
  798. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  799. return NULL;
  800. }
  801. void blas_memory_free(void *free_area){
  802. int position;
  803. #ifdef DEBUG
  804. printf("Unmapped Start : %p ...\n", free_area);
  805. #endif
  806. position = 0;
  807. while ((memory[position].addr != free_area)
  808. && (position < NUM_BUFFERS)) position++;
  809. if (memory[position].addr != free_area) goto error;
  810. #ifdef DEBUG
  811. printf(" Position : %d\n", position);
  812. #endif
  813. memory[position].used = 0;
  814. #ifdef DEBUG
  815. printf("Unmap Succeeded.\n\n");
  816. #endif
  817. return;
  818. error:
  819. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  820. #ifdef DEBUG
  821. for (position = 0; position < NUM_BUFFERS; position++)
  822. printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
  823. #endif
  824. return;
  825. }
  826. void blas_shutdown(void){
  827. int pos;
  828. #ifdef SMP
  829. BLASFUNC(blas_thread_shutdown)();
  830. #endif
  831. LOCK_COMMAND(&alloc_lock);
  832. for (pos = 0; pos < release_pos; pos ++) {
  833. release_info[pos].func(&release_info[pos]);
  834. }
  835. #ifdef SEEK_ADDRESS
  836. base_address = 0UL;
  837. #else
  838. base_address = BASE_ADDRESS;
  839. #endif
  840. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  841. memory[pos].addr = (void *)0;
  842. memory[pos].used = 0;
  843. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  844. memory[pos].pos = -1;
  845. #endif
  846. memory[pos].lock = 0;
  847. }
  848. UNLOCK_COMMAND(&alloc_lock);
  849. return;
  850. }
  851. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  852. #ifdef SMP
  853. #if defined(USE_PTHREAD_LOCK)
  854. static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  855. #elif defined(USE_PTHREAD_SPINLOCK)
  856. static pthread_spinlock_t init_lock = 0;
  857. #else
  858. static BLASULONG init_lock = 0UL;
  859. #endif
  860. #endif
  861. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  862. void *sa, void *sb, BLASLONG pos) {
  863. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  864. size_t size;
  865. BLASULONG buffer;
  866. size = BUFFER_SIZE - PAGESIZE;
  867. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  868. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  869. if (hot_alloc != 2) {
  870. #endif
  871. #ifdef SMP
  872. LOCK_COMMAND(&init_lock);
  873. #endif
  874. while (size > 0) {
  875. *(int *)buffer = size;
  876. buffer += PAGESIZE;
  877. size -= PAGESIZE;
  878. }
  879. #ifdef SMP
  880. UNLOCK_COMMAND(&init_lock);
  881. #endif
  882. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  883. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  884. while (size > 0) {
  885. *(int *)buffer = size;
  886. buffer += 64;
  887. size -= 64;
  888. }
  889. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  890. }
  891. #endif
  892. #endif
  893. }
  #ifdef SMP

  /*
   * Build one work item per thread that runs _touch_memory() and submit the
   * chain through exec_blas().
   *
   * buffer is stored as the sa argument of the first work item only.
   *
   * The number of work items is blas_num_threads, limited to the capacity of
   * the on-stack queue (MAX_CPU_NUMBER).  With no threads configured the
   * function returns without submitting anything, so the array is never
   * indexed at -1.
   */
  static void _init_thread_memory(void *buffer) {

    blas_queue_t queue[MAX_CPU_NUMBER];
    int num_cpu;
    int count;

    /* Keep every index inside queue[0 .. MAX_CPU_NUMBER - 1]. */
    count = blas_num_threads;
    if (count > MAX_CPU_NUMBER) count = MAX_CPU_NUMBER;
    if (count <= 0) return;

    for (num_cpu = 0; num_cpu < count; num_cpu++) {

      blas_queue_init(&queue[num_cpu]);
      queue[num_cpu].mode    = BLAS_DOUBLE | BLAS_REAL;
      queue[num_cpu].routine = &_touch_memory;
      queue[num_cpu].args    = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
    }

    /* Terminate the chain at the last item that was filled in. */
    queue[count - 1].next = NULL;
    queue[0].sa = buffer;

    exec_blas(count, queue);

  }
  #endif
  910. static void gotoblas_memory_init(void) {
  911. void *buffer;
  912. hot_alloc = 1;
  913. buffer = (void *)blas_memory_alloc(0);
  914. #ifdef SMP
  915. if (blas_cpu_number == 0) blas_get_cpu_number();
  916. #ifdef SMP_SERVER
  917. if (blas_server_avail == 0) blas_thread_init();
  918. #endif
  919. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  920. #else
  921. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  922. #endif
  923. blas_memory_free(buffer);
  924. }
  925. #endif
  926. /* Initialization for all function; this function should be called before main */
  927. static int gotoblas_initialized = 0;
  928. void CONSTRUCTOR gotoblas_init(void) {
  929. if (gotoblas_initialized) return;
  930. #ifdef SMP
  931. openblas_fork_handler();
  932. #endif
  933. #ifdef PROFILE
  934. moncontrol (0);
  935. #endif
  936. #ifdef DYNAMIC_ARCH
  937. gotoblas_dynamic_init();
  938. #endif
  939. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  940. gotoblas_affinity_init();
  941. #endif
  942. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  943. gotoblas_memory_init();
  944. #endif
  945. #ifdef SMP
  946. if (blas_cpu_number == 0) blas_get_cpu_number();
  947. #ifdef SMP_SERVER
  948. if (blas_server_avail == 0) blas_thread_init();
  949. #endif
  950. #endif
  951. #ifdef FUNCTION_PROFILE
  952. gotoblas_profile_init();
  953. #endif
  954. gotoblas_initialized = 1;
  955. #ifdef PROFILE
  956. moncontrol (1);
  957. #endif
  958. }
  959. void DESTRUCTOR gotoblas_quit(void) {
  960. if (gotoblas_initialized == 0) return;
  961. #ifdef PROFILE
  962. moncontrol (0);
  963. #endif
  964. #ifdef FUNCTION_PROFILE
  965. gotoblas_profile_quit();
  966. #endif
  967. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  968. gotoblas_affinity_quit();
  969. #endif
  970. #ifdef DYNAMIC_ARCH
  971. gotoblas_dynamic_quit();
  972. #endif
  973. gotoblas_initialized = 0;
  974. #ifdef PROFILE
  975. moncontrol (1);
  976. #endif
  977. blas_shutdown();
  978. }
  #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  /*
   * Not meant to be called: this function exists only as a workaround for a
   * PGI / Sun toolchain bug.  It references gotoblas_init() and
   * gotoblas_quit() and emits assembly that places calls to them in the
   * .init and .fini sections.
   */
  void gotoblas_dummy_for_PGI(void) {

    gotoblas_init();
    gotoblas_quit();

  #if 1
    /* Active variant: call sites in .init / .fini. */
    asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
    asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  #else
    /* Disabled variant: entries in .ctors / .dtors. */
    asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
    asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  #endif
  }
  #endif