You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

memory.c 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289
  1. /*****************************************************************************
  2. Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the ISCAS nor the names of its contributors may
  14. be used to endorse or promote products derived from this software
  15. without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************/
  28. /* Copyright 2009, 2010 The University of Texas at Austin. */
  29. /* All rights reserved. */
  30. /* */
  31. /* Redistribution and use in source and binary forms, with or */
  32. /* without modification, are permitted provided that the following */
  33. /* conditions are met: */
  34. /* */
  35. /* 1. Redistributions of source code must retain the above */
  36. /* copyright notice, this list of conditions and the following */
  37. /* disclaimer. */
  38. /* */
  39. /* 2. Redistributions in binary form must reproduce the above */
  40. /* copyright notice, this list of conditions and the following */
  41. /* disclaimer in the documentation and/or other materials */
  42. /* provided with the distribution. */
  43. /* */
  44. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  45. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  46. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  47. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  48. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  49. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  50. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  51. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  52. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  53. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  54. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  55. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  56. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  57. /* POSSIBILITY OF SUCH DAMAGE. */
  58. /* */
  59. /* The views and conclusions contained in the software and */
  60. /* documentation are those of the authors and should not be */
  61. /* interpreted as representing official policies, either expressed */
  62. /* or implied, of The University of Texas at Austin. */
  63. /*********************************************************************/
  /* Drop any DEBUG definition made before this point (e.g. on the command
     line); the #ifdef DEBUG traces in this file are compiled only if DEBUG
     is defined again afterwards. */
  #undef DEBUG
  #include "common.h"
  /* Select the allocation back ends compiled into this build: Windows gets
     ALLOC_WINDOWS, every other system gets ALLOC_MMAP and ALLOC_MALLOC. */
  #ifdef OS_WINDOWS
  #define ALLOC_WINDOWS
  /* Provide MEM_LARGE_PAGES when the headers included so far did not. */
  #ifndef MEM_LARGE_PAGES
  #define MEM_LARGE_PAGES 0x20000000
  #endif
  #else
  #define ALLOC_MMAP
  #define ALLOC_MALLOC
  #endif
  75. #include <stdlib.h>
  76. #include <stdio.h>
  77. #include <fcntl.h>
  78. #ifndef OS_WINDOWS
  79. #include <sys/mman.h>
  80. #include <sys/shm.h>
  81. #include <sys/ipc.h>
  82. #endif
  83. #include <sys/types.h>
  84. #ifdef OS_LINUX
  85. #include <sys/sysinfo.h>
  86. #include <sched.h>
  87. #include <errno.h>
  88. #include <linux/unistd.h>
  89. #include <sys/syscall.h>
  90. #endif
  91. #if defined(OS_FreeBSD) || defined(OS_Darwin)
  92. #include <sys/sysctl.h>
  93. #endif
  94. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  95. #include <conio.h>
  96. #undef printf
  97. #define printf _cprintf
  98. #endif
  /* Linux: fallback value for MPOL_PREFERRED (the policy argument passed to
     my_mbind() below) when no header defined it. */
  #ifdef OS_LINUX
  #ifndef MPOL_PREFERRED
  #define MPOL_PREFERRED 1
  #endif
  #endif
  /* Force NO_WARMUP on PPC440, on every non-Linux system and for HPL builds. */
  #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  #define NO_WARMUP
  #endif
  /* shmget() flag used by alloc_hugetlb() on Linux.
     NOTE(review): it is defined here only when ALLOC_HUGETLB is set; in other
     builds it has to come from a system header - confirm. */
  #ifdef ALLOC_HUGETLB
  #define SHM_HUGETLB 04000
  #endif
  /* Extra bytes added to malloc/qalloc requests, alignment unit in
     alloc_qalloc(), and gap added when base_address is advanced. */
  #ifndef FIXED_PAGESIZE
  #define FIXED_PAGESIZE 4096
  #endif
  /* Shift a right by b bits, then mask the result with c. */
  #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  /* Shorthand for the compiler's constructor / destructor function attributes. */
  #define CONSTRUCTOR __attribute__ ((constructor))
  #define DESTRUCTOR __attribute__ ((destructor))
  /* DYNAMIC_ARCH builds: global gotoblas pointer, NULL at start-up.
     blas_memory_alloc() calls gotoblas_dynamic_init() while it is still NULL;
     presumably that call fills it in - confirm. */
  #ifdef DYNAMIC_ARCH
  gotoblas_t *gotoblas = NULL;
  #endif
  119. #ifndef SMP
  /* Without SMP support both thread counts are the compile-time constant 1. */
  #define blas_cpu_number 1
  #define blas_num_threads 1

  /* Dummy function: a non-SMP build always reports one processor. */
  int goto_get_num_procs (void) { return 1; }

  /* Dummy function: a non-SMP build ignores the requested thread count. */
  void goto_set_num_threads(int num_threads) { (void)num_threads; }
  125. #else
  #ifdef OS_LINUX
  #ifdef NO_AFFINITY
  /* Processor count as reported by get_nprocs(); the value is fetched on the
     first call and served from a static cache afterwards. */
  int get_num_procs(void) {
    static int cached_count = 0;

    if (cached_count == 0) {
      cached_count = get_nprocs();
    }
    return cached_count;
  }
  #else
  /* Affinity-enabled builds only declare the function here. */
  int get_num_procs(void);
  #endif
  #endif
  #ifdef OS_WINDOWS
  /* Processor count taken from GetSystemInfo(); fetched on the first call and
     served from a static cache afterwards. */
  int get_num_procs(void) {
    static int cached_count = 0;
    SYSTEM_INFO info;

    if (cached_count != 0) return cached_count;

    GetSystemInfo(&info);
    cached_count = info.dwNumberOfProcessors;
    return cached_count;
  }
  #endif
  #if defined(OS_FreeBSD) || defined(OS_Darwin)
  /* Processor count read from the hw.ncpu sysctl; the value is fetched on the
     first call and cached.  If the query fails or yields a non-positive
     number, 1 is reported so that callers never see a count of zero. */
  int get_num_procs(void) {
    static int nums = 0;
    int m[2];
    size_t len;

    if (nums == 0) {
      int count = 0;

      m[0] = CTL_HW;
      m[1] = HW_NCPU;
      len = sizeof(count);

      if (sysctl(m, 2, &count, &len, NULL, 0) != 0 || count < 1) count = 1;
      nums = count;
    }

    return nums;
  }
  #endif
  /* SMP thread configuration.  Both counters start at 0 and are filled in by
     blas_get_cpu_number(). */
  int blas_cpu_number  = 0;
  int blas_num_threads = 0;

  /* Report the current value of blas_cpu_number. */
  int goto_get_num_procs (void)
  {
    int current = blas_cpu_number;

    return current;
  }
  167. int blas_get_cpu_number(void){
  168. char *p;
  169. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  170. int max_num;
  171. #endif
  172. int blas_goto_num = 0;
  173. int blas_omp_num = 0;
  174. if (blas_num_threads) return blas_num_threads;
  175. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  176. max_num = get_num_procs();
  177. #endif
  178. blas_goto_num = 0;
  179. #ifndef USE_OPENMP
  180. p = getenv("OPENBLAS_NUM_THREADS");
  181. if (p) blas_goto_num = atoi(p);
  182. if (blas_goto_num < 0) blas_goto_num = 0;
  183. #endif
  184. blas_omp_num = 0;
  185. p = getenv("OMP_NUM_THREADS");
  186. if (p) blas_omp_num = atoi(p);
  187. if (blas_omp_num < 0) blas_omp_num = 0;
  188. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  189. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  190. else blas_num_threads = MAX_CPU_NUMBER;
  191. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
  192. if (blas_num_threads > max_num) blas_num_threads = max_num;
  193. #endif
  194. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  195. #ifdef DEBUG
  196. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  197. #endif
  198. blas_cpu_number = blas_num_threads;
  199. return blas_num_threads;
  200. }
  201. #endif
  /* One record per buffer that blas_shutdown() has to hand back. */
  struct release_t {
    void *address;                      /* address passed to the release routine */
    void (*func)(struct release_t *);   /* routine that gives the buffer back */
    long attr;                          /* allocator-specific extra: a file descriptor or shm id */
  };
  /* Set to 1 by blas_memory_alloc() once alloc_hugetlb() has succeeded. */
  int hugetlb_allocated = 0;
  /* Release records filled in by the alloc_* routines; release_pos is the
     number of records currently stored. */
  static struct release_t release_info[NUM_BUFFERS];
  static int release_pos = 0;
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
  /* Warm-up state: while 0, alloc_mmap() performs a plain mapping; otherwise
     it runs its placement benchmark and then stores 2.  _touch_memory() does
     nothing once the value is 2. */
  static int hot_alloc = 0;
  #endif
  213. #ifdef ALLOC_MMAP
  214. static void alloc_mmap_free(struct release_t *release){
  215. if (munmap(release -> address, BUFFER_SIZE)) {
  216. printf("OpenBLAS : munmap failed\n");
  217. }
  218. }
  219. #ifdef NO_WARMUP
  220. static void *alloc_mmap(void *address){
  221. void *map_address;
  222. if (address){
  223. map_address = mmap(address,
  224. BUFFER_SIZE,
  225. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  226. } else {
  227. map_address = mmap(address,
  228. BUFFER_SIZE,
  229. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  230. }
  231. if (map_address != (void *)-1) {
  232. release_info[release_pos].address = map_address;
  233. release_info[release_pos].func = alloc_mmap_free;
  234. release_pos ++;
  235. }
  236. #ifdef OS_LINUX
  237. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  238. #endif
  239. return map_address;
  240. }
  241. #else
  /* Number of timed repetitions performed by run_bench(). */
  #define BENCH_ITERATION 4
  /* The warm-up variant of alloc_mmap() maps SCALING * BUFFER_SIZE bytes. */
  #define SCALING 2
  /* Time a pointer-chasing walk over `size` bytes starting at `address`.
     Each step loads the word at the current position and uses it as the next
     position; alloc_mmap() prepares the region so that the first word of
     every page holds the address of the following page.
     Returns the smallest rpcc() difference seen over BENCH_ITERATION runs. */
  static inline BLASULONG run_bench(BLASULONG address, long size) {
    BLASULONG original, *p;
    BLASULONG start, stop, min;
    int iter, i, count;
    min = (BLASULONG)-1;
    /* Temporarily make the last page of the window point back to its start,
       remembering the word that was there. */
    original = *(BLASULONG *)(address + size - PAGESIZE);
    *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
    for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
      p = (BLASULONG *)address;
      /* One step per page in the window. */
      count = size / PAGESIZE;
      start = rpcc();
      for (i = 0; i < count; i ++) {
        p = (BLASULONG *)(*p);
      }
      stop = rpcc();
      if (min > stop - start) min = stop - start;
    }
    /* Put the saved word back. */
    *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
    /* Store the final pointer next to it - presumably so the compiler cannot
       discard the chase loop; confirm. */
    *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
    return min;
  }
  /* Map one BUFFER_SIZE buffer (variant built when NO_WARMUP is not defined).
     address : placement request; non-NULL requests are mapped directly with
               MAP_FIXED, NULL lets this routine choose the location.
     Returns the buffer address, or (void *)-1 when mmap() fails.
     A successful mapping is recorded in release_info[]. */
  static void *alloc_mmap(void *address){
    void *map_address, *best_address;
    BLASULONG best, start, current;
    BLASULONG allocsize;
    if (address){
      /* A caller-supplied address is mapped as requested; the placement
         search below is skipped. */
      map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  #ifdef OS_LINUX
      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  #endif
    } else {
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
      if (hot_alloc == 0) {
        /* Warm-up not requested: plain mapping of exactly BUFFER_SIZE bytes. */
        map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  #ifdef OS_LINUX
        my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  #endif
      } else {
  #endif
        /* Map SCALING times the needed size, benchmark candidate start
           addresses inside it and keep the best BUFFER_SIZE window. */
        map_address = mmap(NULL, BUFFER_SIZE * SCALING,
                           MMAP_ACCESS, MMAP_POLICY, -1, 0);
        if (map_address != (void *)-1) {
  #ifdef OS_LINUX
          my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  #endif
          /* Size of the window timed by run_bench(). */
          allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
          /* Chain the pages: the first word of each page receives the address
             of the next page, over (SCALING - 1) * BUFFER_SIZE bytes. */
          start = (BLASULONG)map_address;
          current = (SCALING - 1) * BUFFER_SIZE;
          while(current > 0) {
            *(long *)start = (long)start + PAGESIZE;
            start += PAGESIZE;
            current -= PAGESIZE;
          }
          /* The last chained page points back to the beginning. */
          *(long *)(start - PAGESIZE) = (BLASULONG)map_address;
          start = (BLASULONG)map_address;
          best = (BLASULONG)-1;
          best_address = map_address;
          /* Slide the window one page at a time and remember the start
             address with the lowest run_bench() result. */
          while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
            current = run_bench(start, allocsize);
            if (best > current) {
              best = current;
              best_address = (void *)start;
            }
            start += PAGESIZE;
          }
          /* Unmap what lies before the chosen window ... */
          if ((BLASULONG)best_address > (BLASULONG)map_address)
            munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
          /* ... and everything after its BUFFER_SIZE bytes. */
          munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
          map_address = best_address;
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
          /* Record that a benchmarked placement has been made. */
          hot_alloc = 2;
  #endif
        }
      }
  #if defined(OS_LINUX) && !defined(NO_WARMUP)
    }
  #endif
    /* Register the buffer so that blas_shutdown() can unmap it. */
    if (map_address != (void *)-1) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_mmap_free;
      release_pos ++;
    }
    return map_address;
  }
  329. #endif
  330. #endif
  331. #ifdef ALLOC_MALLOC
  332. static void alloc_malloc_free(struct release_t *release){
  333. free(release -> address);
  334. }
  335. static void *alloc_malloc(void *address){
  336. void *map_address;
  337. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  338. if (map_address == (void *)NULL) map_address = (void *)-1;
  339. if (map_address != (void *)-1) {
  340. release_info[release_pos].address = map_address;
  341. release_info[release_pos].func = alloc_malloc_free;
  342. release_pos ++;
  343. }
  344. return map_address;
  345. }
  346. #endif
  347. #ifdef ALLOC_QALLOC
  348. void *qalloc(int flags, size_t bytes);
  349. void *qfree (void *address);
  350. #define QNONCACHE 0x1
  351. #define QCOMMS 0x2
  352. #define QFAST 0x4
  353. static void alloc_qalloc_free(struct release_t *release){
  354. qfree(release -> address);
  355. }
  356. static void *alloc_qalloc(void *address){
  357. void *map_address;
  358. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  359. if (map_address == (void *)NULL) map_address = (void *)-1;
  360. if (map_address != (void *)-1) {
  361. release_info[release_pos].address = map_address;
  362. release_info[release_pos].func = alloc_qalloc_free;
  363. release_pos ++;
  364. }
  365. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  366. }
  367. #endif
  368. #ifdef ALLOC_WINDOWS
  369. static void alloc_windows_free(struct release_t *release){
  370. VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
  371. }
  372. static void *alloc_windows(void *address){
  373. void *map_address;
  374. map_address = VirtualAlloc(address,
  375. BUFFER_SIZE,
  376. MEM_RESERVE | MEM_COMMIT,
  377. PAGE_READWRITE);
  378. if (map_address == (void *)NULL) map_address = (void *)-1;
  379. if (map_address != (void *)-1) {
  380. release_info[release_pos].address = map_address;
  381. release_info[release_pos].func = alloc_windows_free;
  382. release_pos ++;
  383. }
  384. return map_address;
  385. }
  386. #endif
  387. #ifdef ALLOC_DEVICEDRIVER
  388. #ifndef DEVICEDRIVER_NAME
  389. #define DEVICEDRIVER_NAME "/dev/mapper"
  390. #endif
  391. static void alloc_devicedirver_free(struct release_t *release){
  392. if (munmap(release -> address, BUFFER_SIZE)) {
  393. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  394. }
  395. if (close(release -> attr)) {
  396. printf("OpenBLAS : Bugphysarea close failed.\n");
  397. }
  398. }
  399. static void *alloc_devicedirver(void *address){
  400. int fd;
  401. void *map_address;
  402. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  403. return (void *)-1;
  404. }
  405. map_address = mmap(address, BUFFER_SIZE,
  406. PROT_READ | PROT_WRITE,
  407. MAP_FILE | MAP_SHARED,
  408. fd, 0);
  409. if (map_address != (void *)-1) {
  410. release_info[release_pos].address = map_address;
  411. release_info[release_pos].attr = fd;
  412. release_info[release_pos].func = alloc_devicedirver_free;
  413. release_pos ++;
  414. }
  415. return map_address;
  416. }
  417. #endif
  418. #ifdef ALLOC_SHM
  419. static void alloc_shm_free(struct release_t *release){
  420. if (shmdt(release -> address)) {
  421. printf("OpenBLAS : Shared memory unmap failed.\n");
  422. }
  423. }
  424. static void *alloc_shm(void *address){
  425. void *map_address;
  426. int shmid;
  427. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  428. map_address = (void *)shmat(shmid, address, 0);
  429. if (map_address != (void *)-1){
  430. #ifdef OS_LINUX
  431. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  432. #endif
  433. shmctl(shmid, IPC_RMID, 0);
  434. release_info[release_pos].address = map_address;
  435. release_info[release_pos].attr = shmid;
  436. release_info[release_pos].func = alloc_shm_free;
  437. release_pos ++;
  438. }
  439. return map_address;
  440. }
  441. #endif
  442. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /* Release routine for alloc_hugetlb().
     Linux / AIX : detach the shared memory segment (failure is reported).
     Solaris     : unmap BUFFER_SIZE bytes.
                   NOTE(review): alloc_hugetlb() obtains this memory with
                   memalign(); confirm that munmap() is the intended release.
     Windows     : release the whole VirtualAlloc() reservation.  MEM_RELEASE
                   requires a size of 0, and MEM_LARGE_PAGES is an allocation
                   flag that is not accepted by VirtualFree(). */
  static void alloc_hugetlb_free(struct release_t *release){

  #if defined(OS_LINUX) || defined(OS_AIX)
    if (shmdt(release -> address)) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif

  #ifdef __sun__
    munmap(release -> address, BUFFER_SIZE);
  #endif

  #ifdef OS_WINDOWS
    VirtualFree(release -> address, 0, MEM_RELEASE);
  #endif

  }
  456. static void *alloc_hugetlb(void *address){
  457. void *map_address = (void *)-1;
  458. #if defined(OS_LINUX) || defined(OS_AIX)
  459. int shmid;
  460. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  461. #ifdef OS_LINUX
  462. SHM_HUGETLB |
  463. #endif
  464. #ifdef OS_AIX
  465. SHM_LGPAGE | SHM_PIN |
  466. #endif
  467. IPC_CREAT | SHM_R | SHM_W);
  468. if (shmid != -1) {
  469. map_address = (void *)shmat(shmid, address, SHM_RND);
  470. #ifdef OS_LINUX
  471. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  472. #endif
  473. if (map_address != (void *)-1){
  474. shmctl(shmid, IPC_RMID, 0);
  475. }
  476. }
  477. #endif
  478. #ifdef __sun__
  479. struct memcntl_mha mha;
  480. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  481. mha.mha_flags = 0;
  482. mha.mha_pagesize = HUGE_PAGESIZE;
  483. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  484. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  485. #endif
  486. #ifdef OS_WINDOWS
  487. HANDLE hToken;
  488. TOKEN_PRIVILEGES tp;
  489. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  490. tp.PrivilegeCount = 1;
  491. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  492. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1;
  493. if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1;
  494. map_address = (void *)VirtualAlloc(address,
  495. BUFFER_SIZE,
  496. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  497. PAGE_READWRITE);
  498. AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL);
  499. if (map_address == (void *)NULL) map_address = (void *)-1;
  500. #endif
  501. if (map_address != (void *)-1){
  502. release_info[release_pos].address = map_address;
  503. release_info[release_pos].func = alloc_hugetlb_free;
  504. release_pos ++;
  505. }
  506. return map_address;
  507. }
  508. #endif
  509. #ifdef ALLOC_HUGETLBFILE
  510. static int hugetlb_pid = 0;
  511. static void alloc_hugetlbfile_free(struct release_t *release){
  512. if (munmap(release -> address, BUFFER_SIZE)) {
  513. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  514. }
  515. if (close(release -> attr)) {
  516. printf("OpenBLAS : HugeTLBfs close failed.\n");
  517. }
  518. }
  519. static void *alloc_hugetlbfile(void *address){
  520. void *map_address = (void *)-1;
  521. int fd;
  522. char filename[64];
  523. if (!hugetlb_pid) hugetlb_pid = getpid();
  524. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  525. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  526. return (void *)-1;
  527. }
  528. unlink(filename);
  529. map_address = mmap(address, BUFFER_SIZE,
  530. PROT_READ | PROT_WRITE,
  531. MAP_SHARED,
  532. fd, 0);
  533. if (map_address != (void *)-1) {
  534. release_info[release_pos].address = map_address;
  535. release_info[release_pos].attr = fd;
  536. release_info[release_pos].func = alloc_hugetlbfile_free;
  537. release_pos ++;
  538. }
  539. return map_address;
  540. }
  541. #endif
  /* Global lock for memory allocation; its type follows the locking
     primitive selected at build time. */
  #if defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  #elif defined(USE_PTHREAD_SPINLOCK)
  static pthread_spinlock_t alloc_lock = 0;
  #else
  static BLASULONG alloc_lock = 0UL;
  #endif
  /* Address passed to the alloc_* routines as placement request.  It starts
     at 0 with SEEK_ADDRESS and at BASE_ADDRESS otherwise; blas_memory_alloc()
     advances it after each allocation while it is non-zero and clears it when
     an allocation attempt fails. */
  #ifdef SEEK_ADDRESS
  static BLASULONG base_address = 0UL;
  #else
  static BLASULONG base_address = BASE_ADDRESS;
  #endif
  /* Table of the NUM_BUFFERS buffer slots handed out by blas_memory_alloc(). */
  static volatile struct {
    BLASULONG lock;   /* per-slot lock used with blas_lock() / blas_unlock() */
    void *addr;       /* buffer address; NULL until the slot is first backed by memory */
  #if defined(WHEREAMI) && !defined(USE_OPENMP)
    int pos;          /* WhereAmI() value of the slot's first user, -1 while unassigned */
  #endif
    int used;         /* non-zero while the slot is checked out */
    /* Padding - presumably sizes each slot to a cache line; confirm. */
  #ifndef __64BIT__
    char dummy[48];
  #else
    char dummy[40];
  #endif
  } memory[NUM_BUFFERS];
  /* 0 until the first-call setup in blas_memory_alloc() has run, then 1
     (DYNAMIC_ARCH builds move it on to 2). */
  static int memory_initialized = 0;
  static void gotoblas_memory_init(void);
  570. /* Memory allocation routine */
  571. /* procpos ... indicates where it comes from */
  572. /* 0 : Level 3 functions */
  573. /* 1 : Level 2 functions */
  574. /* 2 : Thread */
  575. void *blas_memory_alloc(int procpos){
  576. int position;
  577. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  578. int mypos;
  579. #endif
  580. void *map_address;
  581. void *(*memoryalloc[])(void *address) = {
  582. #ifdef ALLOC_DEVICEDRIVER
  583. alloc_devicedirver,
  584. #endif
  585. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  586. alloc_hugetlb,
  587. #endif
  588. #ifdef ALLOC_SHM
  589. alloc_shm,
  590. #endif
  591. #ifdef ALLOC_MMAP
  592. alloc_mmap,
  593. #endif
  594. #ifdef ALLOC_QALLOC
  595. alloc_qalloc,
  596. #endif
  597. #ifdef ALLOC_WINDOWS
  598. alloc_windows,
  599. #endif
  600. #ifdef ALLOC_MALLOC
  601. alloc_malloc,
  602. #endif
  603. NULL,
  604. };
  605. void *(**func)(void *address);
  606. if (!memory_initialized) {
  607. LOCK_COMMAND(&alloc_lock);
  608. if (!memory_initialized) {
  609. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  610. for (position = 0; position < NUM_BUFFERS; position ++){
  611. memory[position].addr = (void *)0;
  612. memory[position].pos = -1;
  613. memory[position].used = 0;
  614. memory[position].lock = 0;
  615. }
  616. #endif
  617. #ifdef DYNAMIC_ARCH
  618. gotoblas_dynamic_init();
  619. #endif
  620. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  621. gotoblas_affinity_init();
  622. #endif
  623. #ifdef SMP
  624. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  625. #endif
  626. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
  627. #ifndef DYNAMIC_ARCH
  628. blas_set_parameter();
  629. #endif
  630. #endif
  631. memory_initialized = 1;
  632. }
  633. UNLOCK_COMMAND(&alloc_lock);
  634. }
  635. #ifdef DEBUG
  636. printf("Alloc Start ...\n");
  637. #endif
  638. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  639. mypos = WhereAmI();
  640. position = mypos;
  641. while (position > NUM_BUFFERS) position >>= 1;
  642. do {
  643. if (!memory[position].used && (memory[position].pos == mypos)) {
  644. blas_lock(&memory[position].lock);
  645. if (!memory[position].used) goto allocation;
  646. blas_unlock(&memory[position].lock);
  647. }
  648. position ++;
  649. } while (position < NUM_BUFFERS);
  650. #endif
  651. position = 0;
  652. do {
  653. if (!memory[position].used) {
  654. blas_lock(&memory[position].lock);
  655. if (!memory[position].used) goto allocation;
  656. blas_unlock(&memory[position].lock);
  657. }
  658. position ++;
  659. } while (position < NUM_BUFFERS);
  660. goto error;
  661. allocation :
  662. #ifdef DEBUG
  663. printf(" Position -> %d\n", position);
  664. #endif
  665. memory[position].used = 1;
  666. blas_unlock(&memory[position].lock);
  667. if (!memory[position].addr) {
  668. do {
  669. #ifdef DEBUG
  670. printf("Allocation Start : %lx\n", base_address);
  671. #endif
  672. map_address = (void *)-1;
  673. func = &memoryalloc[0];
  674. while ((func != NULL) && (map_address == (void *) -1)) {
  675. map_address = (*func)((void *)base_address);
  676. #ifdef ALLOC_DEVICEDRIVER
  677. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  678. fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
  679. }
  680. #endif
  681. #ifdef ALLOC_HUGETLBFILE
  682. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  683. #ifndef OS_WINDOWS
  684. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  685. #endif
  686. }
  687. #endif
  688. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  689. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  690. #endif
  691. func ++;
  692. }
  693. #ifdef DEBUG
  694. printf(" Success -> %08lx\n", map_address);
  695. #endif
  696. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  697. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  698. } while ((BLASLONG)map_address == -1);
  699. memory[position].addr = map_address;
  700. #ifdef DEBUG
  701. printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_area[position], position);
  702. #endif
  703. }
  704. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  705. if (memory[position].pos == -1) memory[position].pos = mypos;
  706. #endif
  707. #ifdef DYNAMIC_ARCH
  708. if (memory_initialized == 1) {
  709. LOCK_COMMAND(&alloc_lock);
  710. if (memory_initialized == 1) {
  711. if (!gotoblas) gotoblas_dynamic_init();
  712. memory_initialized = 2;
  713. }
  714. UNLOCK_COMMAND(&alloc_lock);
  715. }
  716. #endif
  717. #ifdef DEBUG
  718. printf("Mapped : %p %3d\n\n",
  719. (void *)alloc_area[position], position);
  720. #endif
  721. return (void *)memory[position].addr;
  722. error:
  723. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  724. return NULL;
  725. }
  726. void blas_memory_free(void *free_area){
  727. int position;
  728. #ifdef DEBUG
  729. printf("Unmapped Start : %p ...\n", free_area);
  730. #endif
  731. position = 0;
  732. while ((memory[position].addr != free_area)
  733. && (position < NUM_BUFFERS)) position++;
  734. if (memory[position].addr != free_area) goto error;
  735. #ifdef DEBUG
  736. printf(" Position : %d\n", position);
  737. #endif
  738. memory[position].used = 0;
  739. #ifdef DEBUG
  740. printf("Unmap Succeeded.\n\n");
  741. #endif
  742. return;
  743. error:
  744. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  745. #ifdef DEBUG
  746. for (position = 0; position < NUM_BUFFERS; position++)
  747. printf("%4ld %p : %d\n", position, alloc_area[position], alloc_used[position]);
  748. #endif
  749. return;
  750. }
  751. void blas_shutdown(void){
  752. int pos;
  753. #ifdef SMP
  754. BLASFUNC(blas_thread_shutdown)();
  755. #endif
  756. LOCK_COMMAND(&alloc_lock);
  757. for (pos = 0; pos < release_pos; pos ++) {
  758. release_info[pos].func(&release_info[pos]);
  759. }
  760. #ifdef SEEK_ADDRESS
  761. base_address = 0UL;
  762. #else
  763. base_address = BASE_ADDRESS;
  764. #endif
  765. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  766. memory[pos].addr = (void *)0;
  767. memory[pos].used = 0;
  768. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  769. memory[pos].pos = -1;
  770. #endif
  771. memory[pos].lock = 0;
  772. }
  773. UNLOCK_COMMAND(&alloc_lock);
  774. return;
  775. }
  776. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  #if defined(SMP)
  /*
   * Lock held by _touch_memory() around its page-stride pass.  Its type
   * follows the locking primitive selected at build time.
   */
  #if !defined(USE_PTHREAD_LOCK) && !defined(USE_PTHREAD_SPINLOCK)
  static BLASULONG init_lock = 0UL;
  #elif defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
  #else
  static pthread_spinlock_t init_lock = 0;
  #endif
  #endif /* SMP */
  786. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  787. void *sa, void *sb, BLASLONG pos) {
  788. #ifndef ARCH_POWER
  789. long size;
  790. BLASULONG buffer;
  791. size = BUFFER_SIZE - PAGESIZE;
  792. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  793. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  794. if (hot_alloc != 2) {
  795. #endif
  796. #ifdef SMP
  797. LOCK_COMMAND(&init_lock);
  798. #endif
  799. while (size > 0) {
  800. *(int *)buffer = size;
  801. buffer += PAGESIZE;
  802. size -= PAGESIZE;
  803. }
  804. #ifdef SMP
  805. UNLOCK_COMMAND(&init_lock);
  806. #endif
  807. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  808. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  809. while (size > 0) {
  810. *(int *)buffer = size;
  811. buffer += 64;
  812. size -= 64;
  813. }
  814. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  815. }
  816. #endif
  817. #endif
  818. }
  #ifdef SMP

  /*
   * Queue one _touch_memory job per thread and run the chain with exec_blas().
   *
   * buffer is stored as the `sa` argument of the first job only.  The number
   * of jobs is blas_num_threads, bounded to the capacity of the local queue
   * array; nothing is executed when that count is not positive.
   */
  static void _init_thread_memory(void *buffer) {

    blas_queue_t queue[MAX_CPU_NUMBER];
    int num_cpu;
    int num_jobs;

    /* queue[] has MAX_CPU_NUMBER entries; never build more jobs than fit. */
    num_jobs = blas_num_threads;
    if (num_jobs > MAX_CPU_NUMBER) num_jobs = MAX_CPU_NUMBER;

    /* With no jobs, terminating the chain below would write to queue[-1]. */
    if (num_jobs <= 0) return;

    for (num_cpu = 0; num_cpu < num_jobs; num_cpu++) {
      blas_queue_init(&queue[num_cpu]);
      queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
      queue[num_cpu].routine = &_touch_memory;
      queue[num_cpu].args = NULL;
      queue[num_cpu].next = &queue[num_cpu + 1];
    }

    /* The last job ends the linked chain. */
    queue[num_jobs - 1].next = NULL;
    queue[0].sa = buffer;

    exec_blas(num_jobs, queue);
  }

  #endif
  835. static void gotoblas_memory_init(void) {
  836. void *buffer;
  837. hot_alloc = 1;
  838. buffer = (void *)blas_memory_alloc(0);
  839. #ifdef SMP
  840. if (blas_cpu_number == 0) blas_get_cpu_number();
  841. #ifdef SMP_SERVER
  842. if (blas_server_avail == 0) blas_thread_init();
  843. #endif
  844. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  845. #else
  846. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  847. #endif
  848. blas_memory_free(buffer);
  849. }
  850. #endif
  851. /* Initialization for all function; this function should be called before main */
  852. static int gotoblas_initialized = 0;
  853. void CONSTRUCTOR gotoblas_init(void) {
  854. if (gotoblas_initialized) return;
  855. #ifdef PROFILE
  856. moncontrol (0);
  857. #endif
  858. #ifdef DYNAMIC_ARCH
  859. gotoblas_dynamic_init();
  860. #endif
  861. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  862. gotoblas_affinity_init();
  863. #endif
  864. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  865. gotoblas_memory_init();
  866. #endif
  867. #ifdef SMP
  868. if (blas_cpu_number == 0) blas_get_cpu_number();
  869. #ifdef SMP_SERVER
  870. if (blas_server_avail == 0) blas_thread_init();
  871. #endif
  872. #endif
  873. #ifdef FUNCTION_PROFILE
  874. gotoblas_profile_init();
  875. #endif
  876. gotoblas_initialized = 1;
  877. #ifdef PROFILE
  878. moncontrol (1);
  879. #endif
  880. }
  881. void DESTRUCTOR gotoblas_quit(void) {
  882. if (gotoblas_initialized == 0) return;
  883. #ifdef PROFILE
  884. moncontrol (0);
  885. #endif
  886. #ifdef FUNCTION_PROFILE
  887. gotoblas_profile_quit();
  888. #endif
  889. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  890. gotoblas_affinity_quit();
  891. #endif
  892. #ifdef DYNAMIC_ARCH
  893. gotoblas_dynamic_quit();
  894. #endif
  895. gotoblas_initialized = 0;
  896. #ifdef PROFILE
  897. moncontrol (1);
  898. #endif
  899. }
  #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

  /*
   * Don't call me; this is just a workaround for a PGI / Sun bug.
   *
   * The function exists to carry the asm statements below, which emit calls
   * to gotoblas_init and gotoblas_quit into the .init and .fini sections.
   */
  void gotoblas_dummy_for_PGI(void) {

    gotoblas_init();
    gotoblas_quit();

  #if 1
    /* Variant in use: call sites placed in .init / .fini. */
    asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
    asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  #else
    /* Disabled variant: entries placed in .ctors / .dtors. */
    asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
    asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  #endif
  }

  #endif

OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.

Contributors (1)