You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

memory.c 31 kB


  1. /*****************************************************************************
  2. Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************/
  28. /* Copyright 2009, 2010 The University of Texas at Austin. */
  29. /* All rights reserved. */
  30. /* */
  31. /* Redistribution and use in source and binary forms, with or */
  32. /* without modification, are permitted provided that the following */
  33. /* conditions are met: */
  34. /* */
  35. /* 1. Redistributions of source code must retain the above */
  36. /* copyright notice, this list of conditions and the following */
  37. /* disclaimer. */
  38. /* */
  39. /* 2. Redistributions in binary form must reproduce the above */
  40. /* copyright notice, this list of conditions and the following */
  41. /* disclaimer in the documentation and/or other materials */
  42. /* provided with the distribution. */
  43. /* */
  44. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  45. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  46. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  47. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  48. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  49. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  50. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  51. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  52. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  53. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  54. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  55. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  56. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  57. /* POSSIBILITY OF SUCH DAMAGE. */
  58. /* */
  59. /* The views and conclusions contained in the software and */
  60. /* documentation are those of the authors and should not be */
  61. /* interpreted as representing official policies, either expressed */
  62. /* or implied, of The University of Texas at Austin. */
  63. /*********************************************************************/
  64. //#undef DEBUG
  65. #include "common.h"
  66. #include <errno.h>
  67. #ifdef OS_WINDOWS
  68. #define ALLOC_WINDOWS
  69. #ifndef MEM_LARGE_PAGES
  70. #define MEM_LARGE_PAGES 0x20000000
  71. #endif
  72. #else
  73. #define ALLOC_MMAP
  74. #define ALLOC_MALLOC
  75. #endif
  76. #include <stdlib.h>
  77. #include <stdio.h>
  78. #include <fcntl.h>
  79. #ifndef OS_WINDOWS
  80. #include <sys/mman.h>
  81. #include <sys/shm.h>
  82. #include <sys/ipc.h>
  83. #endif
  84. #include <sys/types.h>
  85. #ifdef OS_LINUX
  86. #include <sys/sysinfo.h>
  87. #include <sched.h>
  88. #include <errno.h>
  89. #include <linux/unistd.h>
  90. #include <sys/syscall.h>
  91. #endif
  92. #if defined(OS_FREEBSD) || defined(OS_DARWIN)
  93. #include <sys/sysctl.h>
  94. #include <sys/resource.h>
  95. #endif
  96. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  97. #include <conio.h>
  98. #undef printf
  99. #define printf _cprintf
  100. #endif
  101. #ifdef OS_LINUX
  102. #ifndef MPOL_PREFERRED
  103. #define MPOL_PREFERRED 1
  104. #endif
  105. #endif
  106. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  107. #define NO_WARMUP
  108. #endif
  109. #ifndef SHM_HUGETLB
  110. #define SHM_HUGETLB 04000
  111. #endif
  112. #ifndef FIXED_PAGESIZE
  113. #define FIXED_PAGESIZE 4096
  114. #endif
  115. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  116. #define CONSTRUCTOR __attribute__ ((constructor))
  117. #define DESTRUCTOR __attribute__ ((destructor))
  118. #ifdef DYNAMIC_ARCH
  119. gotoblas_t *gotoblas = NULL;
  120. #endif
  121. #ifndef SMP
  122. #define blas_cpu_number 1
  123. #define blas_num_threads 1
  124. /* Dummy Function */
  /* Single-threaded build: exactly one processor is ever reported. */
  int goto_get_num_procs (void) { return 1; }
  /* Single-threaded build: the requested thread count is accepted and ignored. */
  void goto_set_num_threads(int num_threads) { (void)num_threads; }
  127. #else
  128. #ifdef OS_LINUX
  129. #ifndef NO_AFFINITY
  130. int get_num_procs(void);
  131. #else
  /*
   * Returns the processor count reported by get_nprocs().
   * The value is queried on the first call and cached for later calls.
   */
  int get_num_procs(void) {
    static int cached_count = 0;

    if (cached_count == 0) {
      cached_count = get_nprocs();
    }

    return cached_count;
  }
  137. #endif
  138. #endif
  139. #ifdef OS_WINDOWS
  140. int get_num_procs(void) {
  141. static int nums = 0;
  142. if (nums == 0) {
  143. SYSTEM_INFO sysinfo;
  144. GetSystemInfo(&sysinfo);
  145. nums = sysinfo.dwNumberOfProcessors;
  146. }
  147. return nums;
  148. }
  149. #endif
  150. #if defined(OS_FREEBSD)
  151. int get_num_procs(void) {
  152. static int nums = 0;
  153. int m[2];
  154. size_t len;
  155. if (nums == 0) {
  156. m[0] = CTL_HW;
  157. m[1] = HW_NCPU;
  158. len = sizeof(int);
  159. sysctl(m, 2, &nums, &len, NULL, 0);
  160. }
  161. return nums;
  162. }
  163. #endif
  164. #if defined(OS_DARWIN)
  /*
   * Returns the value of the "hw.physicalcpu" sysctl, cached after the first
   * call.  If the query fails or yields a non-positive value, 1 is reported
   * so that callers never clamp their thread count to zero.
   */
  int get_num_procs(void) {

    static int nums = 0;
    size_t len;

    if (nums == 0) {
      len = sizeof(nums);

      if (sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0) != 0 || nums < 1) {
        nums = 1;
      }
    }

    return nums;
  }
  174. /*
  175. void set_stack_limit(int limitMB){
  176. int result=0;
  177. struct rlimit rl;
  178. rlim_t StackSize;
  179. StackSize=limitMB*1024*1024;
  180. result=getrlimit(RLIMIT_STACK, &rl);
  181. if(result==0){
  182. if(rl.rlim_cur < StackSize){
  183. rl.rlim_cur=StackSize;
  184. result=setrlimit(RLIMIT_STACK, &rl);
  185. if(result !=0){
  186. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  187. }
  188. }
  189. }
  190. }
  191. */
  192. #endif
  193. /*
  194. OpenBLAS uses the number of CPU cores for multithreading.
  195. It can be set with openblas_set_num_threads(int num_threads).
  196. */
  197. int blas_cpu_number = 0;
  198. /*
  199. The number of threads in the thread pool.
  200. This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
  201. */
  202. int blas_num_threads = 0;
  203. int goto_get_num_procs (void) {
  204. return blas_cpu_number;
  205. }
  206. int blas_get_cpu_number(void){
  207. char *p;
  208. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  209. int max_num;
  210. #endif
  211. int blas_goto_num = 0;
  212. int blas_omp_num = 0;
  213. if (blas_num_threads) return blas_num_threads;
  214. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  215. max_num = get_num_procs();
  216. #endif
  217. blas_goto_num = 0;
  218. #ifndef USE_OPENMP
  219. p = getenv("OPENBLAS_NUM_THREADS");
  220. if (p) blas_goto_num = atoi(p);
  221. if (blas_goto_num < 0) blas_goto_num = 0;
  222. if (blas_goto_num == 0) {
  223. p = getenv("GOTO_NUM_THREADS");
  224. if (p) blas_goto_num = atoi(p);
  225. if (blas_goto_num < 0) blas_goto_num = 0;
  226. }
  227. #endif
  228. blas_omp_num = 0;
  229. p = getenv("OMP_NUM_THREADS");
  230. if (p) blas_omp_num = atoi(p);
  231. if (blas_omp_num < 0) blas_omp_num = 0;
  232. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  233. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  234. else blas_num_threads = MAX_CPU_NUMBER;
  235. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
  236. if (blas_num_threads > max_num) blas_num_threads = max_num;
  237. #endif
  238. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  239. #ifdef DEBUG
  240. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  241. #endif
  242. blas_cpu_number = blas_num_threads;
  243. return blas_num_threads;
  244. }
  245. #endif
  /*
   * One entry of the release table: describes a buffer obtained by one of the
   * alloc_* routines and how to give it back.
   */
  struct release_t {
    /* Address that was returned by the underlying allocator. */
    void *address;

    /* Routine that releases this entry; it receives the entry itself. */
    void (*func)(struct release_t *);

    /* Allocator-specific extra value (file descriptor or shm id in this file). */
    long attr;
  };
  251. int hugetlb_allocated = 0;
  252. static struct release_t release_info[NUM_BUFFERS];
  253. static int release_pos = 0;
  254. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  255. static int hot_alloc = 0;
  256. #endif
  257. #ifdef ALLOC_MMAP
  258. static void alloc_mmap_free(struct release_t *release){
  259. if (munmap(release -> address, BUFFER_SIZE)) {
  260. printf("OpenBLAS : munmap failed\n");
  261. }
  262. }
  263. #ifdef NO_WARMUP
  264. static void *alloc_mmap(void *address){
  265. void *map_address;
  266. if (address){
  267. map_address = mmap(address,
  268. BUFFER_SIZE,
  269. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  270. } else {
  271. map_address = mmap(address,
  272. BUFFER_SIZE,
  273. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  274. }
  275. if (map_address != (void *)-1) {
  276. release_info[release_pos].address = map_address;
  277. release_info[release_pos].func = alloc_mmap_free;
  278. release_pos ++;
  279. }
  280. #ifdef OS_LINUX
  281. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  282. #endif
  283. return map_address;
  284. }
  285. #else
  286. #define BENCH_ITERATION 4
  287. #define SCALING 2
  /*
   * Times a pointer chase through the region [address, address + size).
   *
   * The word at address + size - PAGESIZE is temporarily overwritten with
   * `address`, then BENCH_ITERATION passes are run; each pass starts at
   * `address` and follows size / PAGESIZE stored pointers, timed with rpcc().
   * The smallest elapsed value over all passes is returned.
   *
   * The caller is expected to have stored the chain of pointers in the region
   * beforehand (alloc_mmap() does this page by page).
   */
  288. static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
  289. BLASULONG original, *p;
  290. BLASULONG start, stop, min;
  291. int iter, i, count;
  /* Start from the largest representable value so any measurement replaces it. */
  292. min = (BLASULONG)-1;
  /* Save the word that is about to be overwritten, then point it back at `address`. */
  293. original = *(BLASULONG *)(address + size - PAGESIZE);
  294. *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
  295. for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
  296. p = (BLASULONG *)address;
  297. count = size / PAGESIZE;
  298. start = rpcc();
  /* Each step loads the next pointer from the current location. */
  299. for (i = 0; i < count; i ++) {
  300. p = (BLASULONG *)(*p);
  301. }
  302. stop = rpcc();
  303. if (min > stop - start) min = stop - start;
  304. }
  /* Restore the saved word. */
  305. *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  /* NOTE(review): storing the final pointer presumably keeps the chase from being optimized away - confirm. */
  306. *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
  307. return min;
  308. }
  /*
   * Maps a BUFFER_SIZE region with mmap(); this variant is compiled when
   * NO_WARMUP is not defined.
   *
   * address : when non-NULL, the region is mapped exactly there (MAP_FIXED).
   * returns : the mapped address, or (void *)-1 on failure.
   *
   * With a NULL address there are two paths:
   *   - hot_alloc == 0 (Linux warm-up builds): a plain BUFFER_SIZE mapping.
   *   - otherwise: BUFFER_SIZE * SCALING bytes are mapped, candidate start
   *     pages are timed with run_bench(), the candidate with the smallest
   *     time is kept and the bytes before it and after its BUFFER_SIZE
   *     window are unmapped.
   *
   * A successful mapping is recorded in release_info[].
   *
   * NOTE(review): the braces of the hot_alloc if/else are opened and closed
   * inside matching #if blocks; keep the conditionals in sync when editing.
   */
  309. static void *alloc_mmap(void *address){
  310. void *map_address, *best_address;
  311. BLASULONG best, start, current;
  312. BLASULONG allocsize;
  313. if (address){
  314. /* A fixed address was requested: map it directly and skip the benchmark search. */
  315. map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  316. #ifdef OS_LINUX
  /* NOTE(review): called even if mmap() returned (void *)-1; the result is ignored. */
  317. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  318. #endif
  319. } else {
  320. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  321. if (hot_alloc == 0) {
  322. map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  323. #ifdef OS_LINUX
  /* NOTE(review): called even if mmap() returned (void *)-1; the result is ignored. */
  324. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  325. #endif
  326. } else {
  327. #endif
  /* Over-allocate by a factor of SCALING so that several start pages can be tried. */
  328. map_address = mmap(NULL, BUFFER_SIZE * SCALING,
  329. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  330. if (map_address != (void *)-1) {
  331. #ifdef OS_LINUX
  332. #ifdef DEBUG
  333. int ret=0;
  334. ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  335. if(ret==-1){
  /* errno is saved first because perror()/printf() may change it. */
  336. int errsv=errno;
  337. perror("OpenBLAS alloc_mmap:");
  338. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  339. }
  340. #else
  341. my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  342. #endif
  343. #endif
  /* Size of the window that run_bench() walks for each candidate. */
  344. allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
  /* Store in the first word of every page the address of the following page,
     over the first (SCALING - 1) * BUFFER_SIZE bytes. */
  345. start = (BLASULONG)map_address;
  346. current = (SCALING - 1) * BUFFER_SIZE;
  347. while(current > 0) {
  348. *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
  349. start += PAGESIZE;
  350. current -= PAGESIZE;
  351. }
  /* The last page written points back to the start of the mapping. */
  352. *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
  /* Try each page-aligned start in turn and remember the one with the smallest time. */
  353. start = (BLASULONG)map_address;
  354. best = (BLASULONG)-1;
  355. best_address = map_address;
  356. while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
  357. current = run_bench(start, allocsize);
  358. if (best > current) {
  359. best = current;
  360. best_address = (void *)start;
  361. }
  362. start += PAGESIZE;
  363. }
  /* Release the bytes in front of the chosen window, if any ... */
  364. if ((BLASULONG)best_address > (BLASULONG)map_address)
  365. munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
  /* ... and everything after the BUFFER_SIZE bytes that are kept. */
  366. munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
  367. map_address = best_address;
  368. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  /* _touch_memory() skips its work once hot_alloc is 2. */
  369. hot_alloc = 2;
  370. #endif
  371. }
  372. }
  373. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  374. }
  375. #endif
  /* Remember the mapping so that blas_shutdown() can release it. */
  376. if (map_address != (void *)-1) {
  377. release_info[release_pos].address = map_address;
  378. release_info[release_pos].func = alloc_mmap_free;
  379. release_pos ++;
  380. }
  381. return map_address;
  382. }
  383. #endif
  384. #endif
  385. #ifdef ALLOC_MALLOC
  386. static void alloc_malloc_free(struct release_t *release){
  387. free(release -> address);
  388. }
  389. static void *alloc_malloc(void *address){
  390. void *map_address;
  391. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  392. if (map_address == (void *)NULL) map_address = (void *)-1;
  393. if (map_address != (void *)-1) {
  394. release_info[release_pos].address = map_address;
  395. release_info[release_pos].func = alloc_malloc_free;
  396. release_pos ++;
  397. }
  398. return map_address;
  399. }
  400. #endif
  401. #ifdef ALLOC_QALLOC
  402. void *qalloc(int flags, size_t bytes);
  403. void *qfree (void *address);
  404. #define QNONCACHE 0x1
  405. #define QCOMMS 0x2
  406. #define QFAST 0x4
  407. static void alloc_qalloc_free(struct release_t *release){
  408. qfree(release -> address);
  409. }
  410. static void *alloc_qalloc(void *address){
  411. void *map_address;
  412. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  413. if (map_address == (void *)NULL) map_address = (void *)-1;
  414. if (map_address != (void *)-1) {
  415. release_info[release_pos].address = map_address;
  416. release_info[release_pos].func = alloc_qalloc_free;
  417. release_pos ++;
  418. }
  419. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  420. }
  421. #endif
  422. #ifdef ALLOC_WINDOWS
  423. static void alloc_windows_free(struct release_t *release){
  424. VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
  425. }
  426. static void *alloc_windows(void *address){
  427. void *map_address;
  428. map_address = VirtualAlloc(address,
  429. BUFFER_SIZE,
  430. MEM_RESERVE | MEM_COMMIT,
  431. PAGE_READWRITE);
  432. if (map_address == (void *)NULL) map_address = (void *)-1;
  433. if (map_address != (void *)-1) {
  434. release_info[release_pos].address = map_address;
  435. release_info[release_pos].func = alloc_windows_free;
  436. release_pos ++;
  437. }
  438. return map_address;
  439. }
  440. #endif
  441. #ifdef ALLOC_DEVICEDRIVER
  442. #ifndef DEVICEDRIVER_NAME
  443. #define DEVICEDRIVER_NAME "/dev/mapper"
  444. #endif
  445. static void alloc_devicedirver_free(struct release_t *release){
  446. if (munmap(release -> address, BUFFER_SIZE)) {
  447. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  448. }
  449. if (close(release -> attr)) {
  450. printf("OpenBLAS : Bugphysarea close failed.\n");
  451. }
  452. }
  453. static void *alloc_devicedirver(void *address){
  454. int fd;
  455. void *map_address;
  456. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  457. return (void *)-1;
  458. }
  459. map_address = mmap(address, BUFFER_SIZE,
  460. PROT_READ | PROT_WRITE,
  461. MAP_FILE | MAP_SHARED,
  462. fd, 0);
  463. if (map_address != (void *)-1) {
  464. release_info[release_pos].address = map_address;
  465. release_info[release_pos].attr = fd;
  466. release_info[release_pos].func = alloc_devicedirver_free;
  467. release_pos ++;
  468. }
  469. return map_address;
  470. }
  471. #endif
  472. #ifdef ALLOC_SHM
  473. static void alloc_shm_free(struct release_t *release){
  474. if (shmdt(release -> address)) {
  475. printf("OpenBLAS : Shared memory unmap failed.\n");
  476. }
  477. }
  478. static void *alloc_shm(void *address){
  479. void *map_address;
  480. int shmid;
  481. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  482. map_address = (void *)shmat(shmid, address, 0);
  483. if (map_address != (void *)-1){
  484. #ifdef OS_LINUX
  485. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  486. #endif
  487. shmctl(shmid, IPC_RMID, 0);
  488. release_info[release_pos].address = map_address;
  489. release_info[release_pos].attr = shmid;
  490. release_info[release_pos].func = alloc_shm_free;
  491. release_pos ++;
  492. }
  493. return map_address;
  494. }
  495. #endif
  496. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /*
   * Releases a buffer obtained by alloc_hugetlb(), using the call that
   * matches the platform: shmdt() on Linux/AIX (a message is printed on
   * failure), munmap() on Solaris, VirtualFree() on Windows.
   */
  static void alloc_hugetlb_free(struct release_t *release){

  #if defined(OS_LINUX) || defined(OS_AIX)
    const int detach_status = shmdt(release -> address);

    if (detach_status != 0) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif

  #ifdef __sun__
    munmap(release -> address, BUFFER_SIZE);
  #endif

  #ifdef OS_WINDOWS
    VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
  #endif

  }
  510. static void *alloc_hugetlb(void *address){
  511. void *map_address = (void *)-1;
  512. #if defined(OS_LINUX) || defined(OS_AIX)
  513. int shmid;
  514. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  515. #ifdef OS_LINUX
  516. SHM_HUGETLB |
  517. #endif
  518. #ifdef OS_AIX
  519. SHM_LGPAGE | SHM_PIN |
  520. #endif
  521. IPC_CREAT | SHM_R | SHM_W);
  522. if (shmid != -1) {
  523. map_address = (void *)shmat(shmid, address, SHM_RND);
  524. #ifdef OS_LINUX
  525. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  526. #endif
  527. if (map_address != (void *)-1){
  528. shmctl(shmid, IPC_RMID, 0);
  529. }
  530. }
  531. #endif
  532. #ifdef __sun__
  533. struct memcntl_mha mha;
  534. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  535. mha.mha_flags = 0;
  536. mha.mha_pagesize = HUGE_PAGESIZE;
  537. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  538. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  539. #endif
  540. #ifdef OS_WINDOWS
  541. HANDLE hToken;
  542. TOKEN_PRIVILEGES tp;
  543. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  544. tp.PrivilegeCount = 1;
  545. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  546. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1;
  547. if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1;
  548. map_address = (void *)VirtualAlloc(address,
  549. BUFFER_SIZE,
  550. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  551. PAGE_READWRITE);
  552. AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL);
  553. if (map_address == (void *)NULL) map_address = (void *)-1;
  554. #endif
  555. if (map_address != (void *)-1){
  556. release_info[release_pos].address = map_address;
  557. release_info[release_pos].func = alloc_hugetlb_free;
  558. release_pos ++;
  559. }
  560. return map_address;
  561. }
  562. #endif
  563. #ifdef ALLOC_HUGETLBFILE
  564. static int hugetlb_pid = 0;
  565. static void alloc_hugetlbfile_free(struct release_t *release){
  566. if (munmap(release -> address, BUFFER_SIZE)) {
  567. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  568. }
  569. if (close(release -> attr)) {
  570. printf("OpenBLAS : HugeTLBfs close failed.\n");
  571. }
  572. }
  573. static void *alloc_hugetlbfile(void *address){
  574. void *map_address = (void *)-1;
  575. int fd;
  576. char filename[64];
  577. if (!hugetlb_pid) hugetlb_pid = getpid();
  578. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  579. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  580. return (void *)-1;
  581. }
  582. unlink(filename);
  583. map_address = mmap(address, BUFFER_SIZE,
  584. PROT_READ | PROT_WRITE,
  585. MAP_SHARED,
  586. fd, 0);
  587. if (map_address != (void *)-1) {
  588. release_info[release_pos].address = map_address;
  589. release_info[release_pos].attr = fd;
  590. release_info[release_pos].func = alloc_hugetlbfile_free;
  591. release_pos ++;
  592. }
  593. return map_address;
  594. }
  595. #endif
  596. /* Global lock for memory allocation */
  597. #if defined(USE_PTHREAD_LOCK)
  598. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  599. #elif defined(USE_PTHREAD_SPINLOCK)
  600. static pthread_spinlock_t alloc_lock = 0;
  601. #else
  602. static BLASULONG alloc_lock = 0UL;
  603. #endif
  604. #ifdef SEEK_ADDRESS
  605. static BLASULONG base_address = 0UL;
  606. #else
  607. static BLASULONG base_address = BASE_ADDRESS;
  608. #endif
  /*
   * Table of the NUM_BUFFERS work buffers handed out by blas_memory_alloc()
   * and returned through blas_memory_free().
   */
  609. static volatile struct {
  /* Per-slot lock taken with blas_lock()/blas_unlock() while a slot is being claimed. */
  610. BLASULONG lock;
  /* Buffer address for this slot; 0 until a buffer has been allocated for it. */
  611. void *addr;
  612. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  /* WhereAmI() value of the first caller that used this slot; -1 while unassigned. */
  613. int pos;
  614. #endif
  /* Non-zero while the slot is handed out to a caller. */
  615. int used;
  /* NOTE(review): presumably pads each entry to a fixed size (e.g. a cache line) - confirm. */
  616. #ifndef __64BIT__
  617. char dummy[48];
  618. #else
  619. char dummy[40];
  620. #endif
  621. } memory[NUM_BUFFERS];
  622. static int memory_initialized = 0;
  623. static void gotoblas_memory_init(void);
  624. /* Memory allocation routine */
  625. /* procpos ... indicates where it comes from */
  626. /* 0 : Level 3 functions */
  627. /* 1 : Level 2 functions */
  628. /* 2 : Thread */
  629. void *blas_memory_alloc(int procpos){
  630. int position;
  631. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  632. int mypos;
  633. #endif
  634. void *map_address;
  635. void *(*memoryalloc[])(void *address) = {
  636. #ifdef ALLOC_DEVICEDRIVER
  637. alloc_devicedirver,
  638. #endif
  639. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  640. alloc_hugetlb,
  641. #endif
  642. #ifdef ALLOC_SHM
  643. alloc_shm,
  644. #endif
  645. #ifdef ALLOC_MMAP
  646. alloc_mmap,
  647. #endif
  648. #ifdef ALLOC_QALLOC
  649. alloc_qalloc,
  650. #endif
  651. #ifdef ALLOC_WINDOWS
  652. alloc_windows,
  653. #endif
  654. #ifdef ALLOC_MALLOC
  655. alloc_malloc,
  656. #endif
  657. NULL,
  658. };
  659. void *(**func)(void *address);
  660. if (!memory_initialized) {
  661. LOCK_COMMAND(&alloc_lock);
  662. if (!memory_initialized) {
  663. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  664. for (position = 0; position < NUM_BUFFERS; position ++){
  665. memory[position].addr = (void *)0;
  666. memory[position].pos = -1;
  667. memory[position].used = 0;
  668. memory[position].lock = 0;
  669. }
  670. #endif
  671. #ifdef DYNAMIC_ARCH
  672. gotoblas_dynamic_init();
  673. #endif
  674. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  675. gotoblas_affinity_init();
  676. #endif
  677. #ifdef SMP
  678. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  679. #endif
  680. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
  681. #ifndef DYNAMIC_ARCH
  682. blas_set_parameter();
  683. #endif
  684. #endif
  685. memory_initialized = 1;
  686. }
  687. UNLOCK_COMMAND(&alloc_lock);
  688. }
  689. #ifdef DEBUG
  690. printf("Alloc Start ...\n");
  691. #endif
  692. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  693. mypos = WhereAmI();
  694. position = mypos;
  695. while (position > NUM_BUFFERS) position >>= 1;
  696. do {
  697. if (!memory[position].used && (memory[position].pos == mypos)) {
  698. blas_lock(&memory[position].lock);
  699. if (!memory[position].used) goto allocation;
  700. blas_unlock(&memory[position].lock);
  701. }
  702. position ++;
  703. } while (position < NUM_BUFFERS);
  704. #endif
  705. position = 0;
  706. do {
  707. if (!memory[position].used) {
  708. blas_lock(&memory[position].lock);
  709. if (!memory[position].used) goto allocation;
  710. blas_unlock(&memory[position].lock);
  711. }
  712. position ++;
  713. } while (position < NUM_BUFFERS);
  714. goto error;
  715. allocation :
  716. #ifdef DEBUG
  717. printf(" Position -> %d\n", position);
  718. #endif
  719. memory[position].used = 1;
  720. blas_unlock(&memory[position].lock);
  721. if (!memory[position].addr) {
  722. do {
  723. #ifdef DEBUG
  724. printf("Allocation Start : %lx\n", base_address);
  725. #endif
  726. map_address = (void *)-1;
  727. func = &memoryalloc[0];
  728. while ((func != NULL) && (map_address == (void *) -1)) {
  729. map_address = (*func)((void *)base_address);
  730. #ifdef ALLOC_DEVICEDRIVER
  731. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  732. fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
  733. }
  734. #endif
  735. #ifdef ALLOC_HUGETLBFILE
  736. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  737. #ifndef OS_WINDOWS
  738. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  739. #endif
  740. }
  741. #endif
  742. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  743. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  744. #endif
  745. func ++;
  746. }
  747. #ifdef DEBUG
  748. printf(" Success -> %08lx\n", map_address);
  749. #endif
  750. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  751. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  752. } while ((BLASLONG)map_address == -1);
  753. memory[position].addr = map_address;
  754. #ifdef DEBUG
  755. printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
  756. #endif
  757. }
  758. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  759. if (memory[position].pos == -1) memory[position].pos = mypos;
  760. #endif
  761. #ifdef DYNAMIC_ARCH
  762. if (memory_initialized == 1) {
  763. LOCK_COMMAND(&alloc_lock);
  764. if (memory_initialized == 1) {
  765. if (!gotoblas) gotoblas_dynamic_init();
  766. memory_initialized = 2;
  767. }
  768. UNLOCK_COMMAND(&alloc_lock);
  769. }
  770. #endif
  771. #ifdef DEBUG
  772. printf("Mapped : %p %3d\n\n",
  773. (void *)memory[position].addr, position);
  774. #endif
  775. return (void *)memory[position].addr;
  776. error:
  777. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  778. return NULL;
  779. }
  780. void blas_memory_free(void *free_area){
  781. int position;
  782. #ifdef DEBUG
  783. printf("Unmapped Start : %p ...\n", free_area);
  784. #endif
  785. position = 0;
  786. while ((memory[position].addr != free_area)
  787. && (position < NUM_BUFFERS)) position++;
  788. if (memory[position].addr != free_area) goto error;
  789. #ifdef DEBUG
  790. printf(" Position : %d\n", position);
  791. #endif
  792. memory[position].used = 0;
  793. #ifdef DEBUG
  794. printf("Unmap Succeeded.\n\n");
  795. #endif
  796. return;
  797. error:
  798. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  799. #ifdef DEBUG
  800. for (position = 0; position < NUM_BUFFERS; position++)
  801. printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
  802. #endif
  803. return;
  804. }
  805. void blas_shutdown(void){
  806. int pos;
  807. #ifdef SMP
  808. BLASFUNC(blas_thread_shutdown)();
  809. #endif
  810. LOCK_COMMAND(&alloc_lock);
  811. for (pos = 0; pos < release_pos; pos ++) {
  812. release_info[pos].func(&release_info[pos]);
  813. }
  814. #ifdef SEEK_ADDRESS
  815. base_address = 0UL;
  816. #else
  817. base_address = BASE_ADDRESS;
  818. #endif
  819. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  820. memory[pos].addr = (void *)0;
  821. memory[pos].used = 0;
  822. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  823. memory[pos].pos = -1;
  824. #endif
  825. memory[pos].lock = 0;
  826. }
  827. UNLOCK_COMMAND(&alloc_lock);
  828. return;
  829. }
  830. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  831. #ifdef SMP
  832. #if defined(USE_PTHREAD_LOCK)
  833. static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  834. #elif defined(USE_PTHREAD_SPINLOCK)
  835. static pthread_spinlock_t init_lock = 0;
  836. #else
  837. static BLASULONG init_lock = 0UL;
  838. #endif
  839. #endif
  /*
   * Writes into the buffer at sa + GEMM_OFFSET_A so that its pages (and then
   * its first L2_SIZE bytes) are touched.  The signature matches the routine
   * type stored in blas_queue_t by _init_thread_memory(); only `sa` is used.
   *
   * Nothing is done on ARCH_POWER / ARCH_SPARC builds, or when hot_alloc is 2.
   *
   * NOTE(review): `size` is unsigned and is decremented by PAGESIZE / 64 until
   * it reaches 0, so both loops rely on the starting sizes being exact
   * multiples of those steps.
   */
  840. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  841. void *sa, void *sb, BLASLONG pos) {
  842. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  843. size_t size;
  844. BLASULONG buffer;
  845. size = BUFFER_SIZE - PAGESIZE;
  846. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  847. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  848. if (hot_alloc != 2) {
  849. #endif
  850. #ifdef SMP
  /* The page-by-page pass runs under init_lock, one thread at a time. */
  851. LOCK_COMMAND(&init_lock);
  852. #endif
  /* First pass: one int store at the start of every page. */
  853. while (size > 0) {
  854. *(int *)buffer = size;
  855. buffer += PAGESIZE;
  856. size -= PAGESIZE;
  857. }
  858. #ifdef SMP
  859. UNLOCK_COMMAND(&init_lock);
  860. #endif
  /* Second pass: one int store every 64 bytes over at most L2_SIZE bytes. */
  861. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  862. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  863. while (size > 0) {
  864. *(int *)buffer = size;
  865. buffer += 64;
  866. size -= 64;
  867. }
  868. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  869. }
  870. #endif
  871. #endif
  872. }
  873. #ifdef SMP
  874. static void _init_thread_memory(void *buffer) {
  875. blas_queue_t queue[MAX_CPU_NUMBER];
  876. int num_cpu;
  877. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  878. blas_queue_init(&queue[num_cpu]);
  879. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  880. queue[num_cpu].routine = &_touch_memory;
  881. queue[num_cpu].args = NULL;
  882. queue[num_cpu].next = &queue[num_cpu + 1];
  883. }
  884. queue[num_cpu - 1].next = NULL;
  885. queue[0].sa = buffer;
  886. exec_blas(num_cpu, queue);
  887. }
  888. #endif
  889. static void gotoblas_memory_init(void) {
  890. void *buffer;
  891. hot_alloc = 1;
  892. buffer = (void *)blas_memory_alloc(0);
  893. #ifdef SMP
  894. if (blas_cpu_number == 0) blas_get_cpu_number();
  895. #ifdef SMP_SERVER
  896. if (blas_server_avail == 0) blas_thread_init();
  897. #endif
  898. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  899. #else
  900. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  901. #endif
  902. blas_memory_free(buffer);
  903. }
  904. #endif
  905. /* Initialization for all functions; this function should be called before main(). */
  906. static int gotoblas_initialized = 0;
  907. void CONSTRUCTOR gotoblas_init(void) {
  908. if (gotoblas_initialized) return;
  909. #ifdef PROFILE
  910. moncontrol (0);
  911. #endif
  912. #ifdef DYNAMIC_ARCH
  913. gotoblas_dynamic_init();
  914. #endif
  915. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  916. gotoblas_affinity_init();
  917. #endif
  918. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  919. gotoblas_memory_init();
  920. #endif
  921. #ifdef SMP
  922. if (blas_cpu_number == 0) blas_get_cpu_number();
  923. #ifdef SMP_SERVER
  924. if (blas_server_avail == 0) blas_thread_init();
  925. #endif
  926. #endif
  927. #ifdef FUNCTION_PROFILE
  928. gotoblas_profile_init();
  929. #endif
  930. gotoblas_initialized = 1;
  931. #ifdef PROFILE
  932. moncontrol (1);
  933. #endif
  934. }
  935. void DESTRUCTOR gotoblas_quit(void) {
  936. if (gotoblas_initialized == 0) return;
  937. #ifdef PROFILE
  938. moncontrol (0);
  939. #endif
  940. #ifdef FUNCTION_PROFILE
  941. gotoblas_profile_quit();
  942. #endif
  943. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  944. gotoblas_affinity_quit();
  945. #endif
  946. #ifdef DYNAMIC_ARCH
  947. gotoblas_dynamic_quit();
  948. #endif
  949. gotoblas_initialized = 0;
  950. #ifdef PROFILE
  951. moncontrol (1);
  952. #endif
  953. blas_shutdown();
  954. }
  955. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  956. /* Don't call me; this is just work around for PGI / Sun bug */
  /*
   * Never meant to be called.  The function exists only to carry the asm
   * statements below, which place calls to gotoblas_init / gotoblas_quit in
   * the .init / .fini sections.  The .ctors/.dtors variant is disabled by
   * the "#if 0".
   */
  957. void gotoblas_dummy_for_PGI(void) {
  /* The direct calls also make both routines referenced from this function. */
  958. gotoblas_init();
  959. gotoblas_quit();
  960. #if 0
  961. asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  962. asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  963. #else
  964. asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  965. asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  966. #endif
  967. }
  968. #endif