You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

memory.c 37 kB

7 years ago
7 years ago
9 years ago
9 years ago
9 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. //#undef DEBUG
  66. #include "common.h"
  67. #include <errno.h>
  68. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  69. #define ALLOC_WINDOWS
  70. #ifndef MEM_LARGE_PAGES
  71. #define MEM_LARGE_PAGES 0x20000000
  72. #endif
  73. #else
  74. #define ALLOC_MMAP
  75. #define ALLOC_MALLOC
  76. #endif
  77. #include <stdlib.h>
  78. #include <stdio.h>
  79. #include <fcntl.h>
  80. #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
  81. #include <sys/mman.h>
  82. #ifndef NO_SYSV_IPC
  83. #include <sys/shm.h>
  84. #endif
  85. #include <sys/ipc.h>
  86. #endif
  87. #include <sys/types.h>
  88. #ifdef OS_LINUX
  89. #include <sys/sysinfo.h>
  90. #include <sched.h>
  91. #include <errno.h>
  92. #include <linux/unistd.h>
  93. #include <sys/syscall.h>
  94. #include <sys/time.h>
  95. #include <sys/resource.h>
  96. #endif
  97. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
  98. #include <sys/sysctl.h>
  99. #include <sys/resource.h>
  100. #endif
  101. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  102. #include <conio.h>
  103. #undef printf
  104. #define printf _cprintf
  105. #endif
  106. #ifdef OS_LINUX
  107. #ifndef MPOL_PREFERRED
  108. #define MPOL_PREFERRED 1
  109. #endif
  110. #endif
  111. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  112. #define NO_WARMUP
  113. #endif
  114. #ifndef SHM_HUGETLB
  115. #define SHM_HUGETLB 04000
  116. #endif
  117. #ifndef FIXED_PAGESIZE
  118. #define FIXED_PAGESIZE 4096
  119. #endif
  120. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  121. #if defined(_MSC_VER) && !defined(__clang__)
  122. #define CONSTRUCTOR __cdecl
  123. #define DESTRUCTOR __cdecl
  124. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
  125. #define CONSTRUCTOR __attribute__ ((constructor))
  126. #define DESTRUCTOR __attribute__ ((destructor))
  127. #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
  128. #define CONSTRUCTOR __attribute__ ((constructor(101)))
  129. #define DESTRUCTOR __attribute__ ((destructor(101)))
  130. #else
  131. #define CONSTRUCTOR __attribute__ ((constructor))
  132. #define DESTRUCTOR __attribute__ ((destructor))
  133. #endif
  134. #ifdef DYNAMIC_ARCH
  135. gotoblas_t *gotoblas = NULL;
  136. #endif
  137. extern void openblas_warning(int verbose, const char * msg);
  138. #ifndef SMP
  139. #define blas_cpu_number 1
  140. #define blas_num_threads 1
  141. /* Dummy Function */
  /*
   * Single-threaded (non-SMP) build: always reports exactly one processor.
   * The trailing file-scope ';' of the original was dropped because an empty
   * declaration is not valid ISO C.
   */
  int goto_get_num_procs (void) {
    return 1;
  }
  /*
   * Single-threaded (non-SMP) build: the requested thread count is accepted
   * and ignored.  The trailing file-scope ';' of the original was dropped
   * because an empty declaration is not valid ISO C.
   */
  void goto_set_num_threads(int num_threads) {
    (void)num_threads;  /* intentionally unused */
  }
  144. #else
  145. #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
  146. #ifndef NO_AFFINITY
  147. int get_num_procs(void);
  148. #else
  /*
   * Return the number of processors available to this process.
   *
   * The configured processor count from sysconf() is read once and cached.
   * On Linux with glibc >= 2.3 the value is then narrowed to the number of
   * CPUs in the calling thread's affinity mask; if the mask cannot be read
   * the current count is returned unchanged.
   *
   * Fixes relative to the previous version:
   *  - glibc 2.3 - 2.6 path: uses a real cpu_set_t object instead of an
   *    uninitialised pointer, declares the loop variables it uses, and calls
   *    CPU_COUNT() with its single documented argument;
   *  - glibc >= 2.7 path: the set obtained from CPU_ALLOC() is always
   *    released, including when sched_getaffinity() fails.
   */
  int get_num_procs(void) {
    static int nums = 0;

    if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

  #if defined(OS_LINUX) && defined(__GLIBC_PREREQ)
  #if __GLIBC_PREREQ(2, 7)
    {
      /* Dynamically sized set: works for more CPUs than CPU_SETSIZE. */
      cpu_set_t *cpusetp;
      size_t size;

      cpusetp = CPU_ALLOC(nums);
      if (cpusetp == NULL) return nums;

      size = CPU_ALLOC_SIZE(nums);
      if (sched_getaffinity(0, size, cpusetp) == 0) {
        nums = CPU_COUNT_S(size, cpusetp);
      }
      CPU_FREE(cpusetp);
    }
  #elif __GLIBC_PREREQ(2, 3)
    {
      /* Fixed-size set; CPU_COUNT() only exists from glibc 2.6 on. */
      cpu_set_t cpuset;

      if (sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) {
  #if __GLIBC_PREREQ(2, 6)
        nums = CPU_COUNT(&cpuset);
  #else
        int i;
        int n = 0;
        for (i = 0; i < nums; i++) {
          if (CPU_ISSET(i, &cpuset)) n++;
        }
        nums = n;
  #endif
      }
    }
  #endif
  #endif

    return nums;
  }
  189. #endif
  190. #endif
  191. #ifdef OS_ANDROID
  /*
   * Android: report the configured processor count.  sysconf() is queried
   * while the cached value is still zero; afterwards the cached value is
   * returned.
   */
  int get_num_procs(void) {
    static int cached_count = 0;

    if (cached_count == 0) {
      cached_count = sysconf(_SC_NPROCESSORS_CONF);
    }
    return cached_count;
  }
  197. #endif
  198. #ifdef OS_WINDOWS
  199. int get_num_procs(void) {
  200. static int nums = 0;
  201. if (nums == 0) {
  202. SYSTEM_INFO sysinfo;
  203. GetSystemInfo(&sysinfo);
  204. nums = sysinfo.dwNumberOfProcessors;
  205. }
  206. return nums;
  207. }
  208. #endif
  209. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
  210. int get_num_procs(void) {
  211. static int nums = 0;
  212. int m[2];
  213. size_t len;
  214. if (nums == 0) {
  215. m[0] = CTL_HW;
  216. m[1] = HW_NCPU;
  217. len = sizeof(int);
  218. sysctl(m, 2, &nums, &len, NULL, 0);
  219. }
  220. return nums;
  221. }
  222. #endif
  223. #if defined(OS_DARWIN)
  /*
   * Darwin: report the "hw.physicalcpu" sysctl value, cached after the first
   * call.
   *
   * The sysctlbyname() result is now checked.  Previously a failed query left
   * the count at 0; a failed or non-positive reading now falls back to 1.
   */
  int get_num_procs(void) {
    static int nums = 0;

    if (nums == 0) {
      int count = 0;
      size_t len = sizeof(count);

      if (sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0) != 0 || count < 1) {
        count = 1;  /* conservative fallback: single processor */
      }
      nums = count;
    }
    return nums;
  }
  233. /*
  234. void set_stack_limit(int limitMB){
  235. int result=0;
  236. struct rlimit rl;
  237. rlim_t StackSize;
  238. StackSize=limitMB*1024*1024;
  239. result=getrlimit(RLIMIT_STACK, &rl);
  240. if(result==0){
  241. if(rl.rlim_cur < StackSize){
  242. rl.rlim_cur=StackSize;
  243. result=setrlimit(RLIMIT_STACK, &rl);
  244. if(result !=0){
  245. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  246. }
  247. }
  248. }
  249. }
  250. */
  251. #endif
  /*
   * Number of CPU cores OpenBLAS uses for multithreading.
   * It can be changed with openblas_set_num_threads(int num_threads).
   */
  int blas_cpu_number = 0;

  /*
   * Number of threads in the thread pool.
   * This value is equal to or larger than blas_cpu_number, which means that
   * some of the pooled threads may be sleeping.
   */
  int blas_num_threads = 0;

  /* SMP build: report the current value of blas_cpu_number. */
  int goto_get_num_procs (void) {
    const int active_cpus = blas_cpu_number;
    return active_cpus;
  }
  /*
   * Register blas_thread_shutdown as the pthread_atfork() "prepare" handler
   * so that the OpenBLAS-managed pthread pool is shut down before a fork().
   * This applies when OpenBLAS is built with "make USE_OPENMP=0".
   *
   * A hang is still possible when OpenBLAS is built against the libgomp
   * implementation of OpenMP; that problem is tracked at
   *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
   * Until it is resolved, build with USE_OPENMP=0 or link against another
   * OpenMP implementation.
   *
   * The function body is empty on native Windows, on Android, and whenever
   * SMP_SERVER is not defined.
   */
  void openblas_fork_handler()
  {
  #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
    if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL) != 0) {
      openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
    }
  #endif
  }
  281. extern int openblas_num_threads_env();
  282. extern int openblas_goto_num_threads_env();
  283. extern int openblas_omp_num_threads_env();
  284. int blas_get_cpu_number(void){
  285. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  286. int max_num;
  287. #endif
  288. int blas_goto_num = 0;
  289. int blas_omp_num = 0;
  290. if (blas_num_threads) return blas_num_threads;
  291. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  292. max_num = get_num_procs();
  293. #endif
  294. // blas_goto_num = 0;
  295. #ifndef USE_OPENMP
  296. blas_goto_num=openblas_num_threads_env();
  297. if (blas_goto_num < 0) blas_goto_num = 0;
  298. if (blas_goto_num == 0) {
  299. blas_goto_num=openblas_goto_num_threads_env();
  300. if (blas_goto_num < 0) blas_goto_num = 0;
  301. }
  302. #endif
  303. // blas_omp_num = 0;
  304. blas_omp_num=openblas_omp_num_threads_env();
  305. if (blas_omp_num < 0) blas_omp_num = 0;
  306. if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  307. else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  308. else blas_num_threads = MAX_CPU_NUMBER;
  309. #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
  310. if (blas_num_threads > max_num) blas_num_threads = max_num;
  311. #endif
  312. if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
  313. #ifdef DEBUG
  314. printf( "Adjusted number of threads : %3d\n", blas_num_threads);
  315. #endif
  316. blas_cpu_number = blas_num_threads;
  317. return blas_num_threads;
  318. }
  319. #endif
  /*
   * Public query for the processor count: get_num_procs() in SMP builds,
   * the constant 1 otherwise.
   */
  int openblas_get_num_procs(void) {
  #ifdef SMP
    return get_num_procs();
  #else
    return 1;
  #endif
  }
  /*
   * Public query for the thread count in use.  SMP builds first call
   * blas_get_cpu_number() so that blas_cpu_number is initialised if needed,
   * then return blas_cpu_number; non-SMP builds return 1.
   */
  int openblas_get_num_threads(void) {
  #ifdef SMP
    (void)blas_get_cpu_number();  /* called for its initialising side effect */
    return blas_cpu_number;
  #else
    return 1;
  #endif
  }
  /*
   * One record per successfully allocated buffer, filled in by the alloc_*()
   * routines in this file so the buffer can be released later.
   */
  struct release_t {
    /* Address returned by the underlying allocator. */
    void *address;

    /* Routine that releases `address`; it receives this record. */
    void (*func)(struct release_t *);

    /* Allocator-specific extra value: the file-backed allocators store their
       file descriptor here and alloc_shm stores its shmid. */
    long attr;
  };
  341. int hugetlb_allocated = 0;
  342. static struct release_t release_info[NUM_BUFFERS];
  343. static int release_pos = 0;
  344. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  345. static int hot_alloc = 0;
  346. #endif
  347. /* Global lock for memory allocation */
  348. #if defined(USE_PTHREAD_LOCK)
  349. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  350. #elif defined(USE_PTHREAD_SPINLOCK)
  351. static pthread_spinlock_t alloc_lock = 0;
  352. #else
  353. static BLASULONG alloc_lock = 0UL;
  354. #endif
  355. #ifdef ALLOC_MMAP
  356. static void alloc_mmap_free(struct release_t *release){
  357. if (munmap(release -> address, BUFFER_SIZE)) {
  358. printf("OpenBLAS : munmap failed\n");
  359. }
  360. }
  361. #ifdef NO_WARMUP
  362. static void *alloc_mmap(void *address){
  363. void *map_address;
  364. if (address){
  365. map_address = mmap(address,
  366. BUFFER_SIZE,
  367. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  368. } else {
  369. map_address = mmap(address,
  370. BUFFER_SIZE,
  371. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  372. }
  373. if (map_address != (void *)-1) {
  374. #if defined(SMP) && !defined(USE_OPENMP)
  375. LOCK_COMMAND(&alloc_lock);
  376. #endif
  377. release_info[release_pos].address = map_address;
  378. release_info[release_pos].func = alloc_mmap_free;
  379. release_pos ++;
  380. #if defined(SMP) && !defined(USE_OPENMP)
  381. UNLOCK_COMMAND(&alloc_lock);
  382. #endif
  383. }
  384. #ifdef OS_LINUX
  385. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  386. #endif
  387. return map_address;
  388. }
  389. #else
  390. #define BENCH_ITERATION 4
  391. #define SCALING 2
  392. static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {
  393. BLASULONG original, *p;
  394. BLASULONG start, stop, min;
  395. int iter, i, count;
  396. min = (BLASULONG)-1;
  397. original = *(BLASULONG *)(address + size - PAGESIZE);
  398. *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;
  399. for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {
  400. p = (BLASULONG *)address;
  401. count = size / PAGESIZE;
  402. start = rpcc();
  403. for (i = 0; i < count; i ++) {
  404. p = (BLASULONG *)(*p);
  405. }
  406. stop = rpcc();
  407. if (min > stop - start) min = stop - start;
  408. }
  409. *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  410. *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;
  411. return min;
  412. }
  413. static void *alloc_mmap(void *address){
  414. void *map_address, *best_address;
  415. BLASULONG best, start, current;
  416. BLASULONG allocsize;
  417. if (address){
  418. /* Just give up use advanced operation */
  419. map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  420. #ifdef OS_LINUX
  421. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  422. #endif
  423. } else {
  424. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  425. if (hot_alloc == 0) {
  426. map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
  427. #ifdef OS_LINUX
  428. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  429. #endif
  430. } else {
  431. #endif
  432. map_address = mmap(NULL, BUFFER_SIZE * SCALING,
  433. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  434. if (map_address != (void *)-1) {
  435. #ifdef OS_LINUX
  436. #ifdef DEBUG
  437. int ret=0;
  438. ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  439. if(ret==-1){
  440. int errsv=errno;
  441. perror("OpenBLAS alloc_mmap:");
  442. printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
  443. }
  444. #else
  445. my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
  446. #endif
  447. #endif
  448. allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
  449. start = (BLASULONG)map_address;
  450. current = (SCALING - 1) * BUFFER_SIZE;
  451. while(current > 0) {
  452. *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
  453. start += PAGESIZE;
  454. current -= PAGESIZE;
  455. }
  456. *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
  457. start = (BLASULONG)map_address;
  458. best = (BLASULONG)-1;
  459. best_address = map_address;
  460. while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
  461. current = run_bench(start, allocsize);
  462. if (best > current) {
  463. best = current;
  464. best_address = (void *)start;
  465. }
  466. start += PAGESIZE;
  467. }
  468. if ((BLASULONG)best_address > (BLASULONG)map_address)
  469. munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);
  470. munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
  471. map_address = best_address;
  472. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  473. hot_alloc = 2;
  474. #endif
  475. }
  476. }
  477. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  478. }
  479. #endif
  480. if (map_address != (void *)-1) {
  481. #if defined(SMP) && !defined(USE_OPENMP)
  482. LOCK_COMMAND(&alloc_lock);
  483. #endif
  484. release_info[release_pos].address = map_address;
  485. release_info[release_pos].func = alloc_mmap_free;
  486. release_pos ++;
  487. #if defined(SMP) && !defined(USE_OPENMP)
  488. UNLOCK_COMMAND(&alloc_lock);
  489. #endif
  490. }
  491. return map_address;
  492. }
  493. #endif
  494. #endif
  495. #ifdef ALLOC_MALLOC
  496. static void alloc_malloc_free(struct release_t *release){
  497. free(release -> address);
  498. }
  499. static void *alloc_malloc(void *address){
  500. void *map_address;
  501. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  502. if (map_address == (void *)NULL) map_address = (void *)-1;
  503. if (map_address != (void *)-1) {
  504. release_info[release_pos].address = map_address;
  505. release_info[release_pos].func = alloc_malloc_free;
  506. release_pos ++;
  507. }
  508. return map_address;
  509. }
  510. #endif
  511. #ifdef ALLOC_QALLOC
  512. void *qalloc(int flags, size_t bytes);
  513. void *qfree (void *address);
  514. #define QNONCACHE 0x1
  515. #define QCOMMS 0x2
  516. #define QFAST 0x4
  517. static void alloc_qalloc_free(struct release_t *release){
  518. qfree(release -> address);
  519. }
  520. static void *alloc_qalloc(void *address){
  521. void *map_address;
  522. map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
  523. if (map_address == (void *)NULL) map_address = (void *)-1;
  524. if (map_address != (void *)-1) {
  525. release_info[release_pos].address = map_address;
  526. release_info[release_pos].func = alloc_qalloc_free;
  527. release_pos ++;
  528. }
  529. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  530. }
  531. #endif
  532. #ifdef ALLOC_WINDOWS
  533. static void alloc_windows_free(struct release_t *release){
  534. VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
  535. }
  536. static void *alloc_windows(void *address){
  537. void *map_address;
  538. map_address = VirtualAlloc(address,
  539. BUFFER_SIZE,
  540. MEM_RESERVE | MEM_COMMIT,
  541. PAGE_READWRITE);
  542. if (map_address == (void *)NULL) map_address = (void *)-1;
  543. if (map_address != (void *)-1) {
  544. release_info[release_pos].address = map_address;
  545. release_info[release_pos].func = alloc_windows_free;
  546. release_pos ++;
  547. }
  548. return map_address;
  549. }
  550. #endif
  551. #ifdef ALLOC_DEVICEDRIVER
  552. #ifndef DEVICEDRIVER_NAME
  553. #define DEVICEDRIVER_NAME "/dev/mapper"
  554. #endif
  555. static void alloc_devicedirver_free(struct release_t *release){
  556. if (munmap(release -> address, BUFFER_SIZE)) {
  557. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  558. }
  559. if (close(release -> attr)) {
  560. printf("OpenBLAS : Bugphysarea close failed.\n");
  561. }
  562. }
  563. static void *alloc_devicedirver(void *address){
  564. int fd;
  565. void *map_address;
  566. if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
  567. return (void *)-1;
  568. }
  569. map_address = mmap(address, BUFFER_SIZE,
  570. PROT_READ | PROT_WRITE,
  571. MAP_FILE | MAP_SHARED,
  572. fd, 0);
  573. if (map_address != (void *)-1) {
  574. release_info[release_pos].address = map_address;
  575. release_info[release_pos].attr = fd;
  576. release_info[release_pos].func = alloc_devicedirver_free;
  577. release_pos ++;
  578. }
  579. return map_address;
  580. }
  581. #endif
  582. #ifdef ALLOC_SHM
  583. static void alloc_shm_free(struct release_t *release){
  584. if (shmdt(release -> address)) {
  585. printf("OpenBLAS : Shared memory unmap failed.\n");
  586. }
  587. }
  588. static void *alloc_shm(void *address){
  589. void *map_address;
  590. int shmid;
  591. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
  592. map_address = (void *)shmat(shmid, address, 0);
  593. if (map_address != (void *)-1){
  594. #ifdef OS_LINUX
  595. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  596. #endif
  597. shmctl(shmid, IPC_RMID, 0);
  598. release_info[release_pos].address = map_address;
  599. release_info[release_pos].attr = shmid;
  600. release_info[release_pos].func = alloc_shm_free;
  601. release_pos ++;
  602. }
  603. return map_address;
  604. }
  605. #if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
  /*
   * Release routine for alloc_hugetlb().
   *
   *  - Linux / AIX: detach the shared-memory segment with shmdt().
   *  - Solaris:     munmap() BUFFER_SIZE bytes.
   *  - Windows:     VirtualFree() the region.
   *
   * Fix (Windows): the previous call passed MEM_LARGE_PAGES | MEM_DECOMMIT.
   * MEM_LARGE_PAGES is an allocation-type flag, not a VirtualFree() free type,
   * and decommitting would not release the reservation anyway.  The region is
   * now released with MEM_RELEASE and size 0.
   */
  static void alloc_hugetlb_free(struct release_t *release){
  #if defined(OS_LINUX) || defined(OS_AIX)
    if (shmdt(release -> address)) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  #endif

  #ifdef __sun__
    /* NOTE(review): alloc_hugetlb() obtains this buffer from memalign();
       releasing it with munmap() rather than free() looks suspicious -
       confirm before changing. */
    munmap(release -> address, BUFFER_SIZE);
  #endif

  #ifdef OS_WINDOWS
    VirtualFree(release -> address, 0, MEM_RELEASE);
  #endif
  }
  619. static void *alloc_hugetlb(void *address){
  620. void *map_address = (void *)-1;
  621. #if defined(OS_LINUX) || defined(OS_AIX)
  622. int shmid;
  623. shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
  624. #ifdef OS_LINUX
  625. SHM_HUGETLB |
  626. #endif
  627. #ifdef OS_AIX
  628. SHM_LGPAGE | SHM_PIN |
  629. #endif
  630. IPC_CREAT | SHM_R | SHM_W);
  631. if (shmid != -1) {
  632. map_address = (void *)shmat(shmid, address, SHM_RND);
  633. #ifdef OS_LINUX
  634. my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
  635. #endif
  636. if (map_address != (void *)-1){
  637. shmctl(shmid, IPC_RMID, 0);
  638. }
  639. }
  640. #endif
  641. #ifdef __sun__
  642. struct memcntl_mha mha;
  643. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  644. mha.mha_flags = 0;
  645. mha.mha_pagesize = HUGE_PAGESIZE;
  646. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  647. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
  648. #endif
  649. #ifdef OS_WINDOWS
  650. HANDLE hToken;
  651. TOKEN_PRIVILEGES tp;
  652. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  653. tp.PrivilegeCount = 1;
  654. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  655. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
  656. CloseHandle(hToken);
  657. return (void*)-1;
  658. }
  659. if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
  660. CloseHandle(hToken);
  661. return (void*)-1;
  662. }
  663. map_address = (void *)VirtualAlloc(address,
  664. BUFFER_SIZE,
  665. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  666. PAGE_READWRITE);
  667. tp.Privileges[0].Attributes = 0;
  668. AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  669. if (map_address == (void *)NULL) map_address = (void *)-1;
  670. #endif
  671. if (map_address != (void *)-1){
  672. release_info[release_pos].address = map_address;
  673. release_info[release_pos].func = alloc_hugetlb_free;
  674. release_pos ++;
  675. }
  676. return map_address;
  677. }
  678. #endif
  679. #endif
  680. #ifdef ALLOC_HUGETLBFILE
  681. static int hugetlb_pid = 0;
  682. static void alloc_hugetlbfile_free(struct release_t *release){
  683. if (munmap(release -> address, BUFFER_SIZE)) {
  684. printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  685. }
  686. if (close(release -> attr)) {
  687. printf("OpenBLAS : HugeTLBfs close failed.\n");
  688. }
  689. }
  690. static void *alloc_hugetlbfile(void *address){
  691. void *map_address = (void *)-1;
  692. int fd;
  693. char filename[64];
  694. if (!hugetlb_pid) hugetlb_pid = getpid();
  695. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  696. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  697. return (void *)-1;
  698. }
  699. unlink(filename);
  700. map_address = mmap(address, BUFFER_SIZE,
  701. PROT_READ | PROT_WRITE,
  702. MAP_SHARED,
  703. fd, 0);
  704. if (map_address != (void *)-1) {
  705. release_info[release_pos].address = map_address;
  706. release_info[release_pos].attr = fd;
  707. release_info[release_pos].func = alloc_hugetlbfile_free;
  708. release_pos ++;
  709. }
  710. return map_address;
  711. }
  712. #endif
  713. #ifdef SEEK_ADDRESS
  714. static BLASULONG base_address = 0UL;
  715. #else
  716. static BLASULONG base_address = BASE_ADDRESS;
  717. #endif
  718. static volatile struct {
  719. BLASULONG lock;
  720. void *addr;
  721. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  722. int pos;
  723. #endif
  724. int used;
  725. #ifndef __64BIT__
  726. char dummy[48];
  727. #else
  728. char dummy[40];
  729. #endif
  730. } memory[NUM_BUFFERS];
  731. static int memory_initialized = 0;
  732. /* Memory allocation routine */
  733. /* procpos ... indicates where it comes from */
  734. /* 0 : Level 3 functions */
  735. /* 1 : Level 2 functions */
  736. /* 2 : Thread */
  737. void *blas_memory_alloc(int procpos){
  738. int position;
  739. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  740. int mypos;
  741. #endif
  742. void *map_address;
  743. void *(*memoryalloc[])(void *address) = {
  744. #ifdef ALLOC_DEVICEDRIVER
  745. alloc_devicedirver,
  746. #endif
  747. /* Hugetlb implicitly assumes ALLOC_SHM */
  748. #ifdef ALLOC_SHM
  749. alloc_shm,
  750. #endif
  751. #if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
  752. alloc_hugetlb,
  753. #endif
  754. #ifdef ALLOC_MMAP
  755. alloc_mmap,
  756. #endif
  757. #ifdef ALLOC_QALLOC
  758. alloc_qalloc,
  759. #endif
  760. #ifdef ALLOC_WINDOWS
  761. alloc_windows,
  762. #endif
  763. #ifdef ALLOC_MALLOC
  764. alloc_malloc,
  765. #endif
  766. NULL,
  767. };
  768. void *(**func)(void *address);
  769. #if defined(USE_OPENMP)
  770. if (!memory_initialized) {
  771. #endif
  772. LOCK_COMMAND(&alloc_lock);
  773. if (!memory_initialized) {
  774. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  775. for (position = 0; position < NUM_BUFFERS; position ++){
  776. memory[position].addr = (void *)0;
  777. memory[position].pos = -1;
  778. memory[position].used = 0;
  779. memory[position].lock = 0;
  780. }
  781. #endif
  782. #ifdef DYNAMIC_ARCH
  783. gotoblas_dynamic_init();
  784. #endif
  785. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  786. gotoblas_affinity_init();
  787. #endif
  788. #ifdef SMP
  789. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  790. #endif
  791. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
  792. #ifndef DYNAMIC_ARCH
  793. blas_set_parameter();
  794. #endif
  795. #endif
  796. memory_initialized = 1;
  797. }
  798. UNLOCK_COMMAND(&alloc_lock);
  799. #if defined(USE_OPENMP)
  800. }
  801. #endif
  802. #ifdef DEBUG
  803. printf("Alloc Start ...\n");
  804. #endif
  805. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  806. mypos = WhereAmI();
  807. position = mypos;
  808. while (position >= NUM_BUFFERS) position >>= 1;
  809. do {
  810. if (!memory[position].used && (memory[position].pos == mypos)) {
  811. #if defined(SMP) && !defined(USE_OPENMP)
  812. LOCK_COMMAND(&alloc_lock);
  813. #else
  814. blas_lock(&memory[position].lock);
  815. #endif
  816. if (!memory[position].used) goto allocation;
  817. #if defined(SMP) && !defined(USE_OPENMP)
  818. UNLOCK_COMMAND(&alloc_lock);
  819. #else
  820. blas_unlock(&memory[position].lock);
  821. #endif
  822. }
  823. position ++;
  824. } while (position < NUM_BUFFERS);
  825. #endif
  826. position = 0;
  827. do {
  828. #if defined(SMP) && !defined(USE_OPENMP)
  829. LOCK_COMMAND(&alloc_lock);
  830. #else
  831. if (!memory[position].used) {
  832. blas_lock(&memory[position].lock);
  833. #endif
  834. if (!memory[position].used) goto allocation;
  835. #if defined(SMP) && !defined(USE_OPENMP)
  836. UNLOCK_COMMAND(&alloc_lock);
  837. #else
  838. blas_unlock(&memory[position].lock);
  839. }
  840. #endif
  841. position ++;
  842. } while (position < NUM_BUFFERS);
  843. goto error;
  844. allocation :
  845. #ifdef DEBUG
  846. printf(" Position -> %d\n", position);
  847. #endif
  848. memory[position].used = 1;
  849. #if defined(SMP) && !defined(USE_OPENMP)
  850. UNLOCK_COMMAND(&alloc_lock);
  851. #else
  852. blas_unlock(&memory[position].lock);
  853. #endif
  854. if (!memory[position].addr) {
  855. do {
  856. #ifdef DEBUG
  857. printf("Allocation Start : %lx\n", base_address);
  858. #endif
  859. map_address = (void *)-1;
  860. func = &memoryalloc[0];
  861. while ((func != NULL) && (map_address == (void *) -1)) {
  862. map_address = (*func)((void *)base_address);
  863. #ifdef ALLOC_DEVICEDRIVER
  864. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  865. fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
  866. }
  867. #endif
  868. #ifdef ALLOC_HUGETLBFILE
  869. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  870. #ifndef OS_WINDOWS
  871. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
  872. #endif
  873. }
  874. #endif
  875. #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
  876. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  877. #endif
  878. func ++;
  879. }
  880. #ifdef DEBUG
  881. printf(" Success -> %08lx\n", map_address);
  882. #endif
  883. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  884. if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
  885. } while ((BLASLONG)map_address == -1);
  886. #if defined(SMP) && !defined(USE_OPENMP)
  887. LOCK_COMMAND(&alloc_lock);
  888. #endif
  889. memory[position].addr = map_address;
  890. #if defined(SMP) && !defined(USE_OPENMP)
  891. UNLOCK_COMMAND(&alloc_lock);
  892. #endif
  893. #ifdef DEBUG
  894. printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
  895. #endif
  896. }
  897. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  898. if (memory[position].pos == -1) memory[position].pos = mypos;
  899. #endif
  900. #ifdef DYNAMIC_ARCH
  901. if (memory_initialized == 1) {
  902. LOCK_COMMAND(&alloc_lock);
  903. if (memory_initialized == 1) {
  904. if (!gotoblas) gotoblas_dynamic_init();
  905. memory_initialized = 2;
  906. }
  907. UNLOCK_COMMAND(&alloc_lock);
  908. }
  909. #endif
  910. #ifdef DEBUG
  911. printf("Mapped : %p %3d\n\n",
  912. (void *)memory[position].addr, position);
  913. #endif
  914. return (void *)memory[position].addr;
  915. error:
  916. printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  917. return NULL;
  918. }
  919. void blas_memory_free(void *free_area){
  920. int position;
  921. #ifdef DEBUG
  922. printf("Unmapped Start : %p ...\n", free_area);
  923. #endif
  924. position = 0;
  925. #if defined(SMP) && !defined(USE_OPENMP)
  926. LOCK_COMMAND(&alloc_lock);
  927. #endif
  928. while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
  929. position++;
  930. if (memory[position].addr != free_area) goto error;
  931. #ifdef DEBUG
  932. printf(" Position : %d\n", position);
  933. #endif
  934. // arm: ensure all writes are finished before other thread takes this memory
  935. WMB;
  936. memory[position].used = 0;
  937. #if defined(SMP) && !defined(USE_OPENMP)
  938. UNLOCK_COMMAND(&alloc_lock);
  939. #endif
  940. #ifdef DEBUG
  941. printf("Unmap Succeeded.\n\n");
  942. #endif
  943. return;
  944. error:
  945. printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
  946. #ifdef DEBUG
  947. for (position = 0; position < NUM_BUFFERS; position++)
  948. printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
  949. #endif
  950. #if defined(SMP) && !defined(USE_OPENMP)
  951. UNLOCK_COMMAND(&alloc_lock);
  952. #endif
  953. return;
  954. }
  955. void *blas_memory_alloc_nolock(int unused) {
  956. void *map_address;
  957. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  958. return map_address;
  959. }
  /* Counterpart of blas_memory_alloc_nolock(): hands the buffer to free(). */
  void blas_memory_free_nolock(void * map_address) {
    free(map_address);
  }
  963. void blas_shutdown(void){
  964. int pos;
  965. #ifdef SMP
  966. BLASFUNC(blas_thread_shutdown)();
  967. #endif
  968. LOCK_COMMAND(&alloc_lock);
  969. for (pos = 0; pos < release_pos; pos ++) {
  970. release_info[pos].func(&release_info[pos]);
  971. }
  972. #ifdef SEEK_ADDRESS
  973. base_address = 0UL;
  974. #else
  975. base_address = BASE_ADDRESS;
  976. #endif
  977. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  978. memory[pos].addr = (void *)0;
  979. memory[pos].used = 0;
  980. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  981. memory[pos].pos = -1;
  982. #endif
  983. memory[pos].lock = 0;
  984. }
  985. UNLOCK_COMMAND(&alloc_lock);
  986. return;
  987. }
  988. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  #ifdef SMP
  /* Lock taken by _touch_memory() around its page-touching loop; its type
     follows the locking primitive selected at build time. */
  #if   defined(USE_PTHREAD_LOCK)
  static pthread_mutex_t    init_lock = PTHREAD_MUTEX_INITIALIZER;
  #elif defined(USE_PTHREAD_SPINLOCK)
  static pthread_spinlock_t init_lock = 0;
  #else
  static BLASULONG   init_lock = 0UL;
  #endif
  #endif
  998. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  999. void *sa, void *sb, BLASLONG pos) {
  1000. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  1001. size_t size;
  1002. BLASULONG buffer;
  1003. size = BUFFER_SIZE - PAGESIZE;
  1004. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  1005. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1006. if (hot_alloc != 2) {
  1007. #endif
  1008. #ifdef SMP
  1009. LOCK_COMMAND(&init_lock);
  1010. #endif
  1011. while (size > 0) {
  1012. *(int *)buffer = size;
  1013. buffer += PAGESIZE;
  1014. size -= PAGESIZE;
  1015. }
  1016. #ifdef SMP
  1017. UNLOCK_COMMAND(&init_lock);
  1018. #endif
  1019. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  1020. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  1021. while (size > 0) {
  1022. *(int *)buffer = size;
  1023. buffer += 64;
  1024. size -= 64;
  1025. }
  1026. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1027. }
  1028. #endif
  1029. #endif
  1030. }
  1031. #ifdef SMP
  1032. static void _init_thread_memory(void *buffer) {
  1033. blas_queue_t queue[MAX_CPU_NUMBER];
  1034. int num_cpu;
  1035. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  1036. blas_queue_init(&queue[num_cpu]);
  1037. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  1038. queue[num_cpu].routine = &_touch_memory;
  1039. queue[num_cpu].args = NULL;
  1040. queue[num_cpu].next = &queue[num_cpu + 1];
  1041. }
  1042. queue[num_cpu - 1].next = NULL;
  1043. queue[0].sa = buffer;
  1044. exec_blas(num_cpu, queue);
  1045. }
  1046. #endif
  1047. static void gotoblas_memory_init(void) {
  1048. void *buffer;
  1049. hot_alloc = 1;
  1050. buffer = (void *)blas_memory_alloc(0);
  1051. #ifdef SMP
  1052. if (blas_cpu_number == 0) blas_get_cpu_number();
  1053. #ifdef SMP_SERVER
  1054. if (blas_server_avail == 0) blas_thread_init();
  1055. #endif
  1056. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  1057. #else
  1058. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  1059. #endif
  1060. blas_memory_free(buffer);
  1061. }
  1062. #endif
  1063. /* Initialization for all function; this function should be called before main */
  1064. static int gotoblas_initialized = 0;
  1065. extern void openblas_read_env();
  1066. void CONSTRUCTOR gotoblas_init(void) {
  1067. if (gotoblas_initialized) return;
  1068. #ifdef SMP
  1069. openblas_fork_handler();
  1070. #endif
  1071. openblas_read_env();
  1072. #ifdef PROFILE
  1073. moncontrol (0);
  1074. #endif
  1075. #ifdef DYNAMIC_ARCH
  1076. gotoblas_dynamic_init();
  1077. #endif
  1078. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  1079. gotoblas_affinity_init();
  1080. #endif
  1081. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  1082. gotoblas_memory_init();
  1083. #endif
  1084. //#if defined(OS_LINUX)
  1085. #if 0
  1086. struct rlimit curlimit;
  1087. if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
  1088. {
  1089. if ( curlimit.rlim_cur != curlimit.rlim_max )
  1090. {
  1091. curlimit.rlim_cur = curlimit.rlim_max;
  1092. setrlimit(RLIMIT_STACK, &curlimit);
  1093. }
  1094. }
  1095. #endif
  1096. #ifdef SMP
  1097. if (blas_cpu_number == 0) blas_get_cpu_number();
  1098. #ifdef SMP_SERVER
  1099. if (blas_server_avail == 0) blas_thread_init();
  1100. #endif
  1101. #endif
  1102. #ifdef FUNCTION_PROFILE
  1103. gotoblas_profile_init();
  1104. #endif
  1105. gotoblas_initialized = 1;
  1106. #ifdef PROFILE
  1107. moncontrol (1);
  1108. #endif
  1109. }
  1110. void DESTRUCTOR gotoblas_quit(void) {
  1111. if (gotoblas_initialized == 0) return;
  1112. blas_shutdown();
  1113. #ifdef PROFILE
  1114. moncontrol (0);
  1115. #endif
  1116. #ifdef FUNCTION_PROFILE
  1117. gotoblas_profile_quit();
  1118. #endif
  1119. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  1120. gotoblas_affinity_quit();
  1121. #endif
  1122. #ifdef DYNAMIC_ARCH
  1123. gotoblas_dynamic_quit();
  1124. #endif
  1125. gotoblas_initialized = 0;
  1126. #ifdef PROFILE
  1127. moncontrol (1);
  1128. #endif
  1129. }
  1130. #if defined(_MSC_VER) && !defined(__clang__)
  1131. BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
  1132. {
  1133. switch (ul_reason_for_call)
  1134. {
  1135. case DLL_PROCESS_ATTACH:
  1136. gotoblas_init();
  1137. break;
  1138. case DLL_THREAD_ATTACH:
  1139. break;
  1140. case DLL_THREAD_DETACH:
  1141. break;
  1142. case DLL_PROCESS_DETACH:
  1143. gotoblas_quit();
  1144. break;
  1145. default:
  1146. break;
  1147. }
  1148. return TRUE;
  1149. }
  /*
    This is to allow static linking.
    Code adapted from Google performance tools:
    https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
    Reference:
    https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
    http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
  */

  /* Termination hook: shuts the library down through gotoblas_quit() and
     reports success (0). */
  static int on_process_term(void)
  {
    gotoblas_quit();

    return 0;
  }
  1163. #ifdef _WIN64
  1164. #pragma comment(linker, "/INCLUDE:_tls_used")
  1165. #else
  1166. #pragma comment(linker, "/INCLUDE:__tls_used")
  1167. #endif
  1168. #ifdef _WIN64
  1169. #pragma const_seg(".CRT$XLB")
  1170. #else
  1171. #pragma data_seg(".CRT$XLB")
  1172. #endif
  1173. static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
  1174. #ifdef _WIN64
  1175. #pragma const_seg()
  1176. #else
  1177. #pragma data_seg()
  1178. #endif
  1179. #ifdef _WIN64
  1180. #pragma const_seg(".CRT$XTU")
  1181. #else
  1182. #pragma data_seg(".CRT$XTU")
  1183. #endif
  1184. static int(*p_process_term)(void) = on_process_term;
  1185. #ifdef _WIN64
  1186. #pragma const_seg()
  1187. #else
  1188. #pragma data_seg()
  1189. #endif
  1190. #endif
  1191. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
  /* Don't call me; this is just work around for PGI / Sun bug */
  /*
   * The body references gotoblas_init()/gotoblas_quit() and emits calls to
   * both into the .init and .fini sections through inline assembly.  The
   * .ctors/.dtors variant is kept but disabled.
   */
  void gotoblas_dummy_for_PGI(void) {

    gotoblas_init();
    gotoblas_quit();

  #if 0
    /* Disabled alternative: register the two functions in .ctors/.dtors. */
    asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
    asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
  #else
    asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
    asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
  #endif
  }
  1204. #endif