You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

memory.c 30 kB


  1. /*****************************************************************************
  2. Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************/
  28. /* Copyright 2009, 2010 The University of Texas at Austin. */
  29. /* All rights reserved. */
  30. /* */
  31. /* Redistribution and use in source and binary forms, with or */
  32. /* without modification, are permitted provided that the following */
  33. /* conditions are met: */
  34. /* */
  35. /* 1. Redistributions of source code must retain the above */
  36. /* copyright notice, this list of conditions and the following */
  37. /* disclaimer. */
  38. /* */
  39. /* 2. Redistributions in binary form must reproduce the above */
  40. /* copyright notice, this list of conditions and the following */
  41. /* disclaimer in the documentation and/or other materials */
  42. /* provided with the distribution. */
  43. /* */
  44. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  45. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  46. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  47. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  48. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  49. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  50. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  51. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  52. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  53. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  54. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  55. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  56. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  57. /* POSSIBILITY OF SUCH DAMAGE. */
  58. /* */
  59. /* The views and conclusions contained in the software and */
  60. /* documentation are those of the authors and should not be */
  61. /* interpreted as representing official policies, either expressed */
  62. /* or implied, of The University of Texas at Austin. */
  63. /*********************************************************************/
  64. //#undef DEBUG
  65. #include "common.h"
  66. #include <errno.h>
  67. #ifdef OS_WINDOWS
  68. #define ALLOC_WINDOWS
  69. #ifndef MEM_LARGE_PAGES
  70. #define MEM_LARGE_PAGES 0x20000000
  71. #endif
  72. #else
  73. #define ALLOC_MMAP
  74. #define ALLOC_MALLOC
  75. #endif
  76. #include <stdlib.h>
  77. #include <stdio.h>
  78. #include <fcntl.h>
  79. #ifndef OS_WINDOWS
  80. #include <sys/mman.h>
  81. #include <sys/shm.h>
  82. #include <sys/ipc.h>
  83. #endif
  84. #include <sys/types.h>
  85. #ifdef OS_LINUX
  86. #include <sys/sysinfo.h>
  87. #include <sched.h>
  88. #include <errno.h>
  89. #include <linux/unistd.h>
  90. #include <sys/syscall.h>
  91. #endif
  92. #if defined(OS_FreeBSD) || defined(OS_Darwin)
  93. #include <sys/sysctl.h>
  94. #endif
  95. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  96. #include <conio.h>
  97. #undef printf
  98. #define printf _cprintf
  99. #endif
  /* Fallback value for MPOL_PREFERRED when no earlier header defined it. */
  #ifdef OS_LINUX
  #ifndef MPOL_PREFERRED
  #define MPOL_PREFERRED 1
  #endif
  #endif
  /* Force NO_WARMUP on PPC440, on every non-Linux OS, and for HPL builds. */
  #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  #define NO_WARMUP
  #endif
  /* NOTE(review): defines SHM_HUGETLB locally when ALLOC_HUGETLB is set;
     presumably for headers that lack it -- confirm against the target libc. */
  #ifdef ALLOC_HUGETLB
  #define SHM_HUGETLB 04000
  #endif
  /* Default page granularity used for padding and alignment in this file. */
  #ifndef FIXED_PAGESIZE
  #define FIXED_PAGESIZE 4096
  #endif
  /* Shift a right by b bits, then mask the result with c. */
  #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  /* Compiler attributes applied to gotoblas_init() / gotoblas_quit(). */
  #define CONSTRUCTOR __attribute__ ((constructor))
  #define DESTRUCTOR __attribute__ ((destructor))
  /* Global gotoblas pointer for DYNAMIC_ARCH builds; starts out NULL. */
  #ifdef DYNAMIC_ARCH
  gotoblas_t *gotoblas = NULL;
  #endif
  120. #ifndef SMP
  121. #define blas_cpu_number 1
  122. #define blas_num_threads 1
  123. /* Dummy Function */
  /* Single-threaded build: exactly one processor is ever reported. */
  int goto_get_num_procs(void) {
    return 1;
  }

  /* Single-threaded build: the requested thread count is ignored. */
  void goto_set_num_threads(int num_threads) {
    (void)num_threads;
  }
  126. #else
  127. #ifdef OS_LINUX
  128. #ifndef NO_AFFINITY
  129. int get_num_procs(void);
  130. #else
  /* Returns the processor count reported by get_nprocs(). The value is
     queried on the first call and served from a static cache afterwards. */
  int get_num_procs(void) {
    static int cached_count = 0;

    if (cached_count == 0) {
      cached_count = get_nprocs();
    }
    return cached_count;
  }
  136. #endif
  137. #endif
  138. #ifdef OS_WINDOWS
  139. int get_num_procs(void) {
  140. static int nums = 0;
  141. if (nums == 0) {
  142. SYSTEM_INFO sysinfo;
  143. GetSystemInfo(&sysinfo);
  144. nums = sysinfo.dwNumberOfProcessors;
  145. }
  146. return nums;
  147. }
  148. #endif
  149. #if defined(OS_FreeBSD) || defined(OS_Darwin)
  150. int get_num_procs(void) {
  151. static int nums = 0;
  152. int m[2];
  153. size_t len;
  154. if (nums == 0) {
  155. m[0] = CTL_HW;
  156. m[1] = HW_NCPU;
  157. len = sizeof(int);
  158. sysctl(m, 2, &nums, &len, NULL, 0);
  159. }
  160. return nums;
  161. }
  162. #endif
  /* Library-wide thread-count state; both values start at 0. */
  int blas_cpu_number = 0;
  int blas_num_threads = 0;

  /* Reports the current value of blas_cpu_number. */
  int goto_get_num_procs(void) {
    int reported = blas_cpu_number;

    return reported;
  }
  168. int blas_get_cpu_number(void){
  169. char *p;
  170. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  171. int max_num;
  172. #endif
  173. int blas_goto_num = 0;
  174. int blas_omp_num = 0;
  175. if (blas_num_threads) return blas_num_threads;
  176. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  177. max_num = get_num_procs();
  178. #endif
  179. blas_goto_num = 0;
  180. #ifndef USE_OPENMP
  181. p = getenv("OPENBLAS_NUM_THREADS");
  182. if (p) blas_goto_num = atoi(p);
  183. if (blas_goto_num < 0) blas_goto_num = 0;
  184. if (blas_goto_num == 0) {
  185. p = getenv("GOTO_NUM_THREADS");
  186. if (p) blas_goto_num = atoi(p);
  187. if (blas_goto_num < 0) blas_goto_num = 0;
  188. }
  189. #endif
  190. blas_omp_num = 0;
  191. p = getenv("OMP_NUM_THREADS");
  192. if (p) blas_omp_num = atoi(p);
  193. if (blas_omp_num < 0) blas_omp_num = 0;
  194. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  195. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  196. else blas_num_threads = MAX_CPU_NUMBER;
  197. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  198. if (blas_num_threads > max_num) blas_num_threads = max_num;
  199. #endif
  200. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  201. #ifdef DEBUG
  202. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  203. #endif
  204. blas_cpu_number = blas_num_threads;
  205. return blas_num_threads;
  206. }
  207. #endif
  /* Bookkeeping record for one region obtained by an alloc_* routine; the
     records are replayed by blas_shutdown() to give the memory back. */
  struct release_t {
    /* Start of the region as returned by the underlying allocator. */
    void *address;
    /* Routine that releases this region; it receives the record itself. */
    void (*func)(struct release_t *);
    /* Allocator-specific extra: a file descriptor or shm id in this file. */
    long attr;
  };
  /* Set to 1 by blas_memory_alloc() once alloc_hugetlb succeeds. */
  int hugetlb_allocated = 0;
  /* One record is appended per successful allocation. */
  static struct release_t release_info[NUM_BUFFERS];
  /* Index of the next free slot in release_info. */
  static int release_pos = 0;
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
  /* 0 initially, set to 1 by gotoblas_memory_init(), and to 2 after
     alloc_mmap() has completed a benchmarked allocation. */
  static int hot_alloc = 0;
  #endif
  219. #ifdef ALLOC_MMAP
  220. static void alloc_mmap_free(struct release_t *release){
  221. if (munmap(release -> address, BUFFER_SIZE)) {
  222. printf("OpenBLAS : munmap failed\n");
  223. }
  224. }
  225. #ifdef NO_WARMUP
  226. static void *alloc_mmap(void *address){
  227. void *map_address;
  228. if (address){
  229. map_address = mmap(address,
  230. BUFFER_SIZE,
  231. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  232. } else {
  233. map_address = mmap(address,
  234. BUFFER_SIZE,
  235. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  236. }
  237. if (map_address != (void *)-1) {
  238. release_info[release_pos].address = map_address;
  239. release_info[release_pos].func = alloc_mmap_free;
  240. release_pos ++;
  241. }
  242. #ifdef OS_LINUX
  243. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  244. #endif
  245. return map_address;
  246. }
  247. #else
  /* Number of timed passes per candidate window in run_bench(). */
  #define BENCH_ITERATION 4
  /* alloc_mmap() maps SCALING * BUFFER_SIZE bytes and keeps one BUFFER_SIZE
     slice of it. */
  #define SCALING 2
  /* Times a pointer chase across the window [address, address + size).
     The caller must already have stored, in the first word of each page, the
     address of the next page. The word in the last page of the window is
     temporarily pointed back at `address` so the chain stays inside the
     window, then restored before returning.
     Returns the smallest rpcc() delta seen over BENCH_ITERATION passes. */
  static inline BLASULONG run_bench(BLASULONG address, long size) {
    BLASULONG original, *p;
    BLASULONG start, stop, min;
    int iter, i, count;
    min = (BLASULONG)-1;
    /* Save the last link and close the chain back to the window start. */
    original = *(BLASULONG *)(address + size - PAGESIZE);
    *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
    for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
      p = (BLASULONG *)address;
      count = size / PAGESIZE;
      start = rpcc();
      /* One dependent load per page. */
      for (i = 0; i < count; i ++) {
        p = (BLASULONG *)(*p);
      }
      stop = rpcc();
      if (min > stop - start) min = stop - start;
    }
    /* Restore the saved link. */
    *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
    /* NOTE(review): storing p presumably keeps the compiler from discarding
       the chase loop -- confirm. */
    *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
    return min;
  }
  271. static void *alloc_mmap(void *address){
  272. void *map_address, *best_address;
  273. BLASULONG best, start, current;
  274. BLASULONG allocsize;
  275. if (address){
  276. /* Just give up use advanced operation */
  277. map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  278. #ifdef OS_LINUX
  279. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  280. #endif
  281. } else {
  282. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  283. if (hot_alloc == 0) {
  284. map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  285. #ifdef OS_LINUX
  286. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  287. #endif
  288. } else {
  289. #endif
  290. map_address = mmap(NULL, BUFFER_SIZE * SCALING,
  291. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  292. if (map_address != (void *)-1) {
  293. #ifdef OS_LINUX
  294. #if 1
  295. //#ifdef DEBUG
  296. int ret=0;
  297. ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  298. if(ret==-1){
  299. int errsv=errno;
  300. perror("OpenBLAS alloc_mmap:");
  301. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  302. }
  303. #else
  304. my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  305. #endif
  306. #endif
  307. allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
  308. start = (BLASULONG)map_address;
  309. current = (SCALING - 1) * BUFFER_SIZE;
  310. while(current > 0) {
  311. *(long *)start = (long)start + PAGESIZE;
  312. start += PAGESIZE;
  313. current -= PAGESIZE;
  314. }
  315. *(long *)(start - PAGESIZE) = (BLASULONG)map_address;
  316. start = (BLASULONG)map_address;
  317. best = (BLASULONG)-1;
  318. best_address = map_address;
  319. while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
  320. current = run_bench(start, allocsize);
  321. if (best > current) {
  322. best = current;
  323. best_address = (void *)start;
  324. }
  325. start += PAGESIZE;
  326. }
  327. if ((BLASULONG)best_address > (BLASULONG)map_address)
  328. munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
  329. munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
  330. map_address = best_address;
  331. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  332. hot_alloc = 2;
  333. #endif
  334. }
  335. }
  336. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  337. }
  338. #endif
  339. if (map_address != (void *)-1) {
  340. release_info[release_pos].address = map_address;
  341. release_info[release_pos].func = alloc_mmap_free;
  342. release_pos ++;
  343. }
  344. return map_address;
  345. }
  346. #endif
  347. #endif
  348. #ifdef ALLOC_MALLOC
  349. static void alloc_malloc_free(struct release_t *release){
  350. free(release -> address);
  351. }
  352. static void *alloc_malloc(void *address){
  353. void *map_address;
  354. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  355. if (map_address == (void *)NULL) map_address = (void *)-1;
  356. if (map_address != (void *)-1) {
  357. release_info[release_pos].address = map_address;
  358. release_info[release_pos].func = alloc_malloc_free;
  359. release_pos ++;
  360. }
  361. return map_address;
  362. }
  363. #endif
  364. #ifdef ALLOC_QALLOC
  /* External qalloc interface, declared here because no header is included
     for it in this file. */
  void *qalloc(int flags, size_t bytes);
  void *qfree (void *address);
  /* Flag bits for the first argument of qalloc(); alloc_qalloc() passes
     QCOMMS | QFAST. */
  #define QNONCACHE 0x1
  #define QCOMMS 0x2
  #define QFAST 0x4
  370. static void alloc_qalloc_free(struct release_t *release){
  371. qfree(release -> address);
  372. }
  373. static void *alloc_qalloc(void *address){
  374. void *map_address;
  375. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  376. if (map_address == (void *)NULL) map_address = (void *)-1;
  377. if (map_address != (void *)-1) {
  378. release_info[release_pos].address = map_address;
  379. release_info[release_pos].func = alloc_qalloc_free;
  380. release_pos ++;
  381. }
  382. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  383. }
  384. #endif
  385. #ifdef ALLOC_WINDOWS
  386. static void alloc_windows_free(struct release_t *release){
  387. VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
  388. }
  389. static void *alloc_windows(void *address){
  390. void *map_address;
  391. map_address = VirtualAlloc(address,
  392. BUFFER_SIZE,
  393. MEM_RESERVE | MEM_COMMIT,
  394. PAGE_READWRITE);
  395. if (map_address == (void *)NULL) map_address = (void *)-1;
  396. if (map_address != (void *)-1) {
  397. release_info[release_pos].address = map_address;
  398. release_info[release_pos].func = alloc_windows_free;
  399. release_pos ++;
  400. }
  401. return map_address;
  402. }
  403. #endif
  404. #ifdef ALLOC_DEVICEDRIVER
  405. #ifndef DEVICEDRIVER_NAME
  406. #define DEVICEDRIVER_NAME "/dev/mapper"
  407. #endif
  408. static void alloc_devicedirver_free(struct release_t *release){
  409. if (munmap(release -> address, BUFFER_SIZE)) {
  410. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  411. }
  412. if (close(release -> attr)) {
  413. printf("OpenBLAS : Bugphysarea close failed.\n");
  414. }
  415. }
  416. static void *alloc_devicedirver(void *address){
  417. int fd;
  418. void *map_address;
  419. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  420. return (void *)-1;
  421. }
  422. map_address = mmap(address, BUFFER_SIZE,
  423. PROT_READ | PROT_WRITE,
  424. MAP_FILE | MAP_SHARED,
  425. fd, 0);
  426. if (map_address != (void *)-1) {
  427. release_info[release_pos].address = map_address;
  428. release_info[release_pos].attr = fd;
  429. release_info[release_pos].func = alloc_devicedirver_free;
  430. release_pos ++;
  431. }
  432. return map_address;
  433. }
  434. #endif
  435. #ifdef ALLOC_SHM
  436. static void alloc_shm_free(struct release_t *release){
  437. if (shmdt(release -> address)) {
  438. printf("OpenBLAS : Shared memory unmap failed.\n");
  439. }
  440. }
  441. static void *alloc_shm(void *address){
  442. void *map_address;
  443. int shmid;
  444. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  445. map_address = (void *)shmat(shmid, address, 0);
  446. if (map_address != (void *)-1){
  447. #ifdef OS_LINUX
  448. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  449. #endif
  450. shmctl(shmid, IPC_RMID, 0);
  451. release_info[release_pos].address = map_address;
  452. release_info[release_pos].attr = shmid;
  453. release_info[release_pos].func = alloc_shm_free;
  454. release_pos ++;
  455. }
  456. return map_address;
  457. }
  458. #endif
  459. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /* Releases a region obtained by alloc_hugetlb(), using the mechanism that
     matches how the region was obtained on each platform. */
  static void alloc_hugetlb_free(struct release_t *release){
  #if defined(OS_LINUX) || defined(OS_AIX)
    if (shmdt(release -> address)) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif
  #ifdef __sun__
    munmap(release -> address, BUFFER_SIZE);
  #endif
  #ifdef OS_WINDOWS
    /* VirtualFree() accepts MEM_DECOMMIT or MEM_RELEASE as the free type;
       MEM_LARGE_PAGES is an allocation flag and makes the call fail.
       MEM_RELEASE requires a size of 0 and gives the whole region back. */
    VirtualFree(release -> address, 0, MEM_RELEASE);
  #endif
  }
  473. static void *alloc_hugetlb(void *address){
  474. void *map_address = (void *)-1;
  475. #if defined(OS_LINUX) || defined(OS_AIX)
  476. int shmid;
  477. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  478. #ifdef OS_LINUX
  479. SHM_HUGETLB |
  480. #endif
  481. #ifdef OS_AIX
  482. SHM_LGPAGE | SHM_PIN |
  483. #endif
  484. IPC_CREAT | SHM_R | SHM_W);
  485. if (shmid != -1) {
  486. map_address = (void *)shmat(shmid, address, SHM_RND);
  487. #ifdef OS_LINUX
  488. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  489. #endif
  490. if (map_address != (void *)-1){
  491. shmctl(shmid, IPC_RMID, 0);
  492. }
  493. }
  494. #endif
  495. #ifdef __sun__
  496. struct memcntl_mha mha;
  497. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  498. mha.mha_flags = 0;
  499. mha.mha_pagesize = HUGE_PAGESIZE;
  500. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  501. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  502. #endif
  503. #ifdef OS_WINDOWS
  504. HANDLE hToken;
  505. TOKEN_PRIVILEGES tp;
  506. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  507. tp.PrivilegeCount = 1;
  508. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  509. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1;
  510. if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1;
  511. map_address = (void *)VirtualAlloc(address,
  512. BUFFER_SIZE,
  513. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  514. PAGE_READWRITE);
  515. AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL);
  516. if (map_address == (void *)NULL) map_address = (void *)-1;
  517. #endif
  518. if (map_address != (void *)-1){
  519. release_info[release_pos].address = map_address;
  520. release_info[release_pos].func = alloc_hugetlb_free;
  521. release_pos ++;
  522. }
  523. return map_address;
  524. }
  525. #endif
  526. #ifdef ALLOC_HUGETLBFILE
  527. static int hugetlb_pid = 0;
  528. static void alloc_hugetlbfile_free(struct release_t *release){
  529. if (munmap(release -> address, BUFFER_SIZE)) {
  530. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  531. }
  532. if (close(release -> attr)) {
  533. printf("OpenBLAS : HugeTLBfs close failed.\n");
  534. }
  535. }
  536. static void *alloc_hugetlbfile(void *address){
  537. void *map_address = (void *)-1;
  538. int fd;
  539. char filename[64];
  540. if (!hugetlb_pid) hugetlb_pid = getpid();
  541. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  542. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  543. return (void *)-1;
  544. }
  545. unlink(filename);
  546. map_address = mmap(address, BUFFER_SIZE,
  547. PROT_READ | PROT_WRITE,
  548. MAP_SHARED,
  549. fd, 0);
  550. if (map_address != (void *)-1) {
  551. release_info[release_pos].address = map_address;
  552. release_info[release_pos].attr = fd;
  553. release_info[release_pos].func = alloc_hugetlbfile_free;
  554. release_pos ++;
  555. }
  556. return map_address;
  557. }
  558. #endif
  559. /* Global lock for memory allocation */
  /* alloc_lock is taken through LOCK_COMMAND / UNLOCK_COMMAND; its type
     follows the locking primitive selected at build time. */
  #if defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  #elif defined(USE_PTHREAD_SPINLOCK)
  static pthread_spinlock_t alloc_lock = 0;
  #else
  static BLASULONG alloc_lock = 0UL;
  #endif
  /* Address hint handed to the alloc_* routines by blas_memory_alloc().
     0 means no hint is given; otherwise it starts at BASE_ADDRESS and is
     advanced after each successful allocation. */
  #ifdef SEEK_ADDRESS
  static BLASULONG base_address = 0UL;
  #else
  static BLASULONG base_address = BASE_ADDRESS;
  #endif
  /* Table of buffer slots handed out by blas_memory_alloc() and returned by
     blas_memory_free(). */
  static volatile struct {
    /* Per-slot lock taken with blas_lock() while the slot is being claimed. */
    BLASULONG lock;
    /* Buffer address; 0 until the slot is backed by an allocation. */
    void *addr;
  #if defined(WHEREAMI) && !defined(USE_OPENMP)
    /* WhereAmI() value of the first claimant of this slot, or -1 if none. */
    int pos;
  #endif
    /* Non-zero while the slot is handed out. */
    int used;
    /* NOTE(review): padding, presumably sized to keep slots on separate
       cache lines -- confirm against the struct layout. */
  #ifndef __64BIT__
    char dummy[48];
  #else
    char dummy[40];
  #endif
  } memory[NUM_BUFFERS];
  /* 0 until blas_memory_alloc() has run its one-time setup; DYNAMIC_ARCH
     builds later advance it to 2. */
  static int memory_initialized = 0;
  /* Defined further down, only for Linux builds without NO_WARMUP. */
  static void gotoblas_memory_init(void);
  587. /* Memory allocation routine */
  588. /* procpos ... indicates where it comes from */
  589. /* 0 : Level 3 functions */
  590. /* 1 : Level 2 functions */
  591. /* 2 : Thread */
  592. void *blas_memory_alloc(int procpos){
  593. int position;
  594. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  595. int mypos;
  596. #endif
  597. void *map_address;
  598. void *(*memoryalloc[])(void *address) = {
  599. #ifdef ALLOC_DEVICEDRIVER
  600. alloc_devicedirver,
  601. #endif
  602. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  603. alloc_hugetlb,
  604. #endif
  605. #ifdef ALLOC_SHM
  606. alloc_shm,
  607. #endif
  608. #ifdef ALLOC_MMAP
  609. alloc_mmap,
  610. #endif
  611. #ifdef ALLOC_QALLOC
  612. alloc_qalloc,
  613. #endif
  614. #ifdef ALLOC_WINDOWS
  615. alloc_windows,
  616. #endif
  617. #ifdef ALLOC_MALLOC
  618. alloc_malloc,
  619. #endif
  620. NULL,
  621. };
  622. void *(**func)(void *address);
  623. if (!memory_initialized) {
  624. LOCK_COMMAND(&alloc_lock);
  625. if (!memory_initialized) {
  626. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  627. for (position = 0; position < NUM_BUFFERS; position ++){
  628. memory[position].addr = (void *)0;
  629. memory[position].pos = -1;
  630. memory[position].used = 0;
  631. memory[position].lock = 0;
  632. }
  633. #endif
  634. #ifdef DYNAMIC_ARCH
  635. gotoblas_dynamic_init();
  636. #endif
  637. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  638. gotoblas_affinity_init();
  639. #endif
  640. #ifdef SMP
  641. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  642. #endif
  643. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
  644. #ifndef DYNAMIC_ARCH
  645. blas_set_parameter();
  646. #endif
  647. #endif
  648. memory_initialized = 1;
  649. }
  650. UNLOCK_COMMAND(&alloc_lock);
  651. }
  652. #ifdef DEBUG
  653. printf("Alloc Start ...\n");
  654. #endif
  655. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  656. mypos = WhereAmI();
  657. position = mypos;
  658. while (position > NUM_BUFFERS) position >>= 1;
  659. do {
  660. if (!memory[position].used && (memory[position].pos == mypos)) {
  661. blas_lock(&memory[position].lock);
  662. if (!memory[position].used) goto allocation;
  663. blas_unlock(&memory[position].lock);
  664. }
  665. position ++;
  666. } while (position < NUM_BUFFERS);
  667. #endif
  668. position = 0;
  669. do {
  670. if (!memory[position].used) {
  671. blas_lock(&memory[position].lock);
  672. if (!memory[position].used) goto allocation;
  673. blas_unlock(&memory[position].lock);
  674. }
  675. position ++;
  676. } while (position < NUM_BUFFERS);
  677. goto error;
  678. allocation :
  679. #ifdef DEBUG
  680. printf(" Position -> %d\n", position);
  681. #endif
  682. memory[position].used = 1;
  683. blas_unlock(&memory[position].lock);
  684. if (!memory[position].addr) {
  685. do {
  686. #ifdef DEBUG
  687. printf("Allocation Start : %lx\n", base_address);
  688. #endif
  689. map_address = (void *)-1;
  690. func = &memoryalloc[0];
  691. while ((func != NULL) && (map_address == (void *) -1)) {
  692. map_address = (*func)((void *)base_address);
  693. #ifdef ALLOC_DEVICEDRIVER
  694. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  695. fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
  696. }
  697. #endif
  698. #ifdef ALLOC_HUGETLBFILE
  699. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  700. #ifndef OS_WINDOWS
  701. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  702. #endif
  703. }
  704. #endif
  705. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  706. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  707. #endif
  708. func ++;
  709. }
  710. #ifdef DEBUG
  711. printf(" Success -> %08lx\n", map_address);
  712. #endif
  713. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  714. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  715. } while ((BLASLONG)map_address == -1);
  716. memory[position].addr = map_address;
  717. #ifdef DEBUG
  718. printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
  719. #endif
  720. }
  721. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  722. if (memory[position].pos == -1) memory[position].pos = mypos;
  723. #endif
  724. #ifdef DYNAMIC_ARCH
  725. if (memory_initialized == 1) {
  726. LOCK_COMMAND(&alloc_lock);
  727. if (memory_initialized == 1) {
  728. if (!gotoblas) gotoblas_dynamic_init();
  729. memory_initialized = 2;
  730. }
  731. UNLOCK_COMMAND(&alloc_lock);
  732. }
  733. #endif
  734. #ifdef DEBUG
  735. printf("Mapped : %p %3d\n\n",
  736. (void *)memory[position].addr, position);
  737. #endif
  738. return (void *)memory[position].addr;
  739. error:
  740. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  741. return NULL;
  742. }
  743. void blas_memory_free(void *free_area){
  744. int position;
  745. #ifdef DEBUG
  746. printf("Unmapped Start : %p ...\n", free_area);
  747. #endif
  748. position = 0;
  749. while ((memory[position].addr != free_area)
  750. && (position < NUM_BUFFERS)) position++;
  751. if (memory[position].addr != free_area) goto error;
  752. #ifdef DEBUG
  753. printf(" Position : %d\n", position);
  754. #endif
  755. memory[position].used = 0;
  756. #ifdef DEBUG
  757. printf("Unmap Succeeded.\n\n");
  758. #endif
  759. return;
  760. error:
  761. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  762. #ifdef DEBUG
  763. for (position = 0; position < NUM_BUFFERS; position++)
  764. printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
  765. #endif
  766. return;
  767. }
  768. void blas_shutdown(void){
  769. int pos;
  770. #ifdef SMP
  771. BLASFUNC(blas_thread_shutdown)();
  772. #endif
  773. LOCK_COMMAND(&alloc_lock);
  774. for (pos = 0; pos < release_pos; pos ++) {
  775. release_info[pos].func(&release_info[pos]);
  776. }
  777. #ifdef SEEK_ADDRESS
  778. base_address = 0UL;
  779. #else
  780. base_address = BASE_ADDRESS;
  781. #endif
  782. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  783. memory[pos].addr = (void *)0;
  784. memory[pos].used = 0;
  785. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  786. memory[pos].pos = -1;
  787. #endif
  788. memory[pos].lock = 0;
  789. }
  790. UNLOCK_COMMAND(&alloc_lock);
  791. return;
  792. }
  793. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  /* Lock taken by _touch_memory() around its page-by-page write loop in SMP
     builds; its type follows the locking primitive selected at build time. */
  #ifdef SMP
  #if defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  #elif defined(USE_PTHREAD_SPINLOCK)
  static pthread_spinlock_t init_lock = 0;
  #else
  static BLASULONG init_lock = 0UL;
  #endif
  #endif
  803. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  804. void *sa, void *sb, BLASLONG pos) {
  805. #ifndef ARCH_POWER
  806. long size;
  807. BLASULONG buffer;
  808. size = BUFFER_SIZE - PAGESIZE;
  809. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  810. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  811. if (hot_alloc != 2) {
  812. #endif
  813. #ifdef SMP
  814. LOCK_COMMAND(&init_lock);
  815. #endif
  816. while (size > 0) {
  817. *(int *)buffer = size;
  818. buffer += PAGESIZE;
  819. size -= PAGESIZE;
  820. }
  821. #ifdef SMP
  822. UNLOCK_COMMAND(&init_lock);
  823. #endif
  824. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  825. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  826. while (size > 0) {
  827. *(int *)buffer = size;
  828. buffer += 64;
  829. size -= 64;
  830. }
  831. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  832. }
  833. #endif
  834. #endif
  835. }
  836. #ifdef SMP
  837. static void _init_thread_memory(void *buffer) {
  838. blas_queue_t queue[MAX_CPU_NUMBER];
  839. int num_cpu;
  840. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  841. blas_queue_init(&queue[num_cpu]);
  842. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  843. queue[num_cpu].routine = &_touch_memory;
  844. queue[num_cpu].args = NULL;
  845. queue[num_cpu].next = &queue[num_cpu + 1];
  846. }
  847. queue[num_cpu - 1].next = NULL;
  848. queue[0].sa = buffer;
  849. exec_blas(num_cpu, queue);
  850. }
  851. #endif
  852. static void gotoblas_memory_init(void) {
  853. void *buffer;
  854. hot_alloc = 1;
  855. buffer = (void *)blas_memory_alloc(0);
  856. #ifdef SMP
  857. if (blas_cpu_number == 0) blas_get_cpu_number();
  858. #ifdef SMP_SERVER
  859. if (blas_server_avail == 0) blas_thread_init();
  860. #endif
  861. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  862. #else
  863. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  864. #endif
  865. blas_memory_free(buffer);
  866. }
  867. #endif
  868. /* Initialization for all function; this function should be called before main */
  /* 1 between a completed gotoblas_init() and the next gotoblas_quit(). */
  static int gotoblas_initialized = 0;
  /* Library start-up routine, marked CONSTRUCTOR. A second call is a no-op
     while gotoblas_initialized is set. Each step below is compiled in only
     for the matching build configuration. */
  void CONSTRUCTOR gotoblas_init(void) {
    if (gotoblas_initialized) return;
  #ifdef PROFILE
    /* Profiling is switched off with moncontrol(0) for the duration of
       start-up and back on at the end. */
    moncontrol (0);
  #endif
  #ifdef DYNAMIC_ARCH
    gotoblas_dynamic_init();
  #endif
  #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
    gotoblas_affinity_init();
  #endif
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
    gotoblas_memory_init();
  #endif
  #ifdef SMP
    /* Settle the thread count if nothing has done so yet. */
    if (blas_cpu_number == 0) blas_get_cpu_number();
  #ifdef SMP_SERVER
    if (blas_server_avail == 0) blas_thread_init();
  #endif
  #endif
  #ifdef FUNCTION_PROFILE
    gotoblas_profile_init();
  #endif
    gotoblas_initialized = 1;
  #ifdef PROFILE
    moncontrol (1);
  #endif
  }
  /* Library tear-down routine, marked DESTRUCTOR. Does nothing unless
     gotoblas_init() has completed. Each step below is compiled in only for
     the matching build configuration. */
  void DESTRUCTOR gotoblas_quit(void) {
    if (gotoblas_initialized == 0) return;
  #ifdef PROFILE
    /* Profiling is switched off with moncontrol(0) for the duration of
       tear-down and back on at the end. */
    moncontrol (0);
  #endif
  #ifdef FUNCTION_PROFILE
    gotoblas_profile_quit();
  #endif
  #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
    gotoblas_affinity_quit();
  #endif
  #ifdef DYNAMIC_ARCH
    gotoblas_dynamic_quit();
  #endif
    /* Allow gotoblas_init() to run again afterwards. */
    gotoblas_initialized = 0;
  #ifdef PROFILE
    moncontrol (1);
  #endif
  }
  917. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  918. /* Don't call me; this is just work around for PGI / Sun bug */
  /* Never meant to be called. The body references gotoblas_init() and
     gotoblas_quit() and emits assembler that places calls to both in the
     .init and .fini sections. */
  void gotoblas_dummy_for_PGI(void) {
    gotoblas_init();
    gotoblas_quit();
  #if 0
    /* Disabled alternative: register both routines through .ctors/.dtors. */
    asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
    asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  #else
    /* Active variant: call both routines from the .init/.fini sections. */
    asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
    asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  #endif
  }
  930. #endif

OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.