You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

memory.c 87 kB

Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
4 years ago
4 years ago
7 years ago
7 years ago
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
Remove the need for most locking in memory.c. Using thread local storage for tracking memory allocations means that threads no longer have to lock at all when doing memory allocations / frees. This particularly helps the gemm driver since it does an allocation per invocation. Even without threading at all, this helps, since even calling a lock with no contention has a cost: Before this change, no threading: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 102 ns 102 ns 13504412 BM_SGEMM/6 175 ns 175 ns 7997580 BM_SGEMM/8 205 ns 205 ns 6842073 BM_SGEMM/10 266 ns 266 ns 5294919 BM_SGEMM/16 478 ns 478 ns 2963441 BM_SGEMM/20 690 ns 690 ns 2144755 BM_SGEMM/32 1906 ns 1906 ns 716981 BM_SGEMM/40 2983 ns 2983 ns 473218 BM_SGEMM/64 9421 ns 9422 ns 148450 BM_SGEMM/72 12630 ns 12631 ns 112105 BM_SGEMM/80 15845 ns 15846 ns 89118 BM_SGEMM/90 25675 ns 25676 ns 54332 BM_SGEMM/100 29864 ns 29865 ns 47120 BM_SGEMM/112 37841 ns 37842 ns 36717 BM_SGEMM/128 56531 ns 56532 ns 25361 BM_SGEMM/140 75886 ns 75888 ns 18143 BM_SGEMM/150 98493 ns 98496 ns 14299 BM_SGEMM/160 102620 ns 102622 ns 13381 BM_SGEMM/170 135169 ns 135173 ns 10231 BM_SGEMM/180 146170 ns 146172 ns 9535 BM_SGEMM/189 190226 ns 190231 ns 7397 BM_SGEMM/200 194513 ns 194519 ns 7210 BM_SGEMM/256 396561 ns 396573 ns 3531 ``` with this change: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 14500387 BM_SGEMM/6 166 ns 166 ns 8381763 BM_SGEMM/8 196 ns 196 ns 7277044 BM_SGEMM/10 256 ns 256 ns 5515721 BM_SGEMM/16 463 ns 463 ns 3025197 BM_SGEMM/20 636 ns 636 ns 2070213 BM_SGEMM/32 1885 ns 1885 ns 739444 BM_SGEMM/40 2969 ns 2969 ns 472152 BM_SGEMM/64 9371 ns 9372 ns 148932 BM_SGEMM/72 12431 ns 12431 ns 112919 BM_SGEMM/80 15615 ns 15616 ns 89978 BM_SGEMM/90 25397 ns 25398 ns 55041 BM_SGEMM/100 29445 ns 29446 ns 47540 
BM_SGEMM/112 37530 ns 37531 ns 37286 BM_SGEMM/128 55373 ns 55375 ns 25277 BM_SGEMM/140 76241 ns 76241 ns 18259 BM_SGEMM/150 102196 ns 102200 ns 13736 BM_SGEMM/160 101521 ns 101525 ns 13556 BM_SGEMM/170 136182 ns 136184 ns 10567 BM_SGEMM/180 146861 ns 146864 ns 9035 BM_SGEMM/189 192632 ns 192632 ns 7231 BM_SGEMM/200 198547 ns 198555 ns 6995 BM_SGEMM/256 392316 ns 392330 ns 3539 ``` Before, when built with USE_THREAD=1, GEMM_MULTITHREAD_THRESHOLD = 4, the cost of small matrix operations was overshadowed by thread locking (look smaller than 32) even when not explicitly spawning threads: ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 328 ns 328 ns 4170562 BM_SGEMM/6 396 ns 396 ns 3536400 BM_SGEMM/8 418 ns 418 ns 3330102 BM_SGEMM/10 491 ns 491 ns 2863047 BM_SGEMM/16 710 ns 710 ns 2028314 BM_SGEMM/20 871 ns 871 ns 1581546 BM_SGEMM/32 2132 ns 2132 ns 657089 BM_SGEMM/40 3197 ns 3196 ns 437969 BM_SGEMM/64 9645 ns 9645 ns 144987 BM_SGEMM/72 35064 ns 32881 ns 50264 BM_SGEMM/80 37661 ns 35787 ns 42080 BM_SGEMM/90 36507 ns 36077 ns 40091 BM_SGEMM/100 32513 ns 31850 ns 48607 BM_SGEMM/112 41742 ns 41207 ns 37273 BM_SGEMM/128 67211 ns 65095 ns 21933 BM_SGEMM/140 68263 ns 67943 ns 19245 BM_SGEMM/150 121854 ns 115439 ns 10660 BM_SGEMM/160 116826 ns 115539 ns 10000 BM_SGEMM/170 126566 ns 122798 ns 11960 BM_SGEMM/180 130088 ns 127292 ns 11503 BM_SGEMM/189 120309 ns 116634 ns 13162 BM_SGEMM/200 114559 ns 110993 ns 10000 BM_SGEMM/256 217063 ns 207806 ns 6417 ``` and after, it's gone (note this includes my other change which reduces calls to num_cpu_avail): ``` ---------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------- BM_SGEMM/4 95 ns 95 ns 12347650 BM_SGEMM/6 166 ns 166 ns 8259683 BM_SGEMM/8 193 ns 193 ns 7162210 BM_SGEMM/10 258 ns 258 ns 5415657 BM_SGEMM/16 471 ns 471 ns 2981009 BM_SGEMM/20 666 ns 666 ns 
2148002 BM_SGEMM/32 1903 ns 1903 ns 738245 BM_SGEMM/40 2969 ns 2969 ns 473239 BM_SGEMM/64 9440 ns 9440 ns 148442 BM_SGEMM/72 37239 ns 33330 ns 46813 BM_SGEMM/80 57350 ns 55949 ns 32251 BM_SGEMM/90 36275 ns 36249 ns 42259 BM_SGEMM/100 31111 ns 31008 ns 45270 BM_SGEMM/112 43782 ns 40912 ns 34749 BM_SGEMM/128 67375 ns 64406 ns 22443 BM_SGEMM/140 76389 ns 67003 ns 21430 BM_SGEMM/150 72952 ns 71830 ns 19793 BM_SGEMM/160 97039 ns 96858 ns 11498 BM_SGEMM/170 123272 ns 122007 ns 11855 BM_SGEMM/180 126828 ns 126505 ns 11567 BM_SGEMM/189 115179 ns 114665 ns 11044 BM_SGEMM/200 89289 ns 87259 ns 16147 BM_SGEMM/256 226252 ns 222677 ns 7375 ``` I've also tested this with ThreadSanitizer and found no data races during execution. I'm not sure why 200 is always faster than it's neighbors, we must be hitting some optimal cache size or something.
7 years ago
9 years ago
9 years ago
9 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. //#undef DEBUG
  66. #include "common.h"
  67. #define NEW_BUFFERS 512
  68. #ifndef likely
  69. #ifdef __GNUC__
  70. #define likely(x) __builtin_expect(!!(x), 1)
  71. #define unlikely(x) __builtin_expect(!!(x), 0)
  72. #else
  73. #define likely(x) (x)
  74. #define unlikely(x) (x)
  75. #endif
  76. #endif
  77. #if defined(USE_TLS) && defined(SMP)
  78. #define COMPILE_TLS
  79. #if USE_TLS != 1
  80. #undef COMPILE_TLS
  81. #endif
  82. #if defined(__GLIBC_PREREQ)
  83. #if !__GLIBC_PREREQ(2,20)
  84. #undef COMPILE_TLS
  85. #endif
  86. #endif
  87. #endif
  88. /* Memory buffer must fit two matrix subblocks of maximal size */
  89. #define XSTR(x) STR(x)
  90. #define STR(x) #x
  91. #if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \
  92. BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \
  93. BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2)
  94. #warning BUFFER_SIZE is too small for P, Q, and R of SGEMM - large calculations may crash !
  95. #endif
  96. #if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \
  97. BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \
  98. BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2)
  99. #warning BUFFER_SIZE is too small for P, Q, and R of DGEMM - large calculations may crash !
  100. #endif
  101. #if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \
  102. BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \
  103. BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2)
  104. #warning BUFFER_SIZE is too small for P, Q, and R of CGEMM - large calculations may crash !
  105. #endif
  106. #if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \
  107. BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \
  108. BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2)
  109. #warning BUFFER_SIZE is too small for P, Q, and R of ZGEMM - large calculations may crash !
  110. #endif
  111. #if defined(COMPILE_TLS)
  112. #include <errno.h>
  113. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  114. #define ALLOC_WINDOWS
  115. #ifndef MEM_LARGE_PAGES
  116. #define MEM_LARGE_PAGES 0x20000000
  117. #endif
  118. #else
  119. #define ALLOC_MMAP
  120. #define ALLOC_MALLOC
  121. #endif
  122. #include <stdlib.h>
  123. #include <stdio.h>
  124. #include <fcntl.h>
  125. #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
  126. #include <sys/mman.h>
  127. #ifndef NO_SYSV_IPC
  128. #include <sys/shm.h>
  129. #endif
  130. #include <sys/ipc.h>
  131. #endif
  132. #include <sys/types.h>
  133. #ifdef OS_LINUX
  134. #include <sys/sysinfo.h>
  135. #include <sched.h>
  136. #include <errno.h>
  137. #include <linux/unistd.h>
  138. #include <sys/syscall.h>
  139. #include <sys/time.h>
  140. #include <sys/resource.h>
  141. #endif
  142. #ifdef OS_HAIKU
  143. #include <unistd.h>
  144. #endif
  145. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
  146. #include <sys/sysctl.h>
  147. #include <sys/resource.h>
  148. #endif
  149. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  150. #include <conio.h>
  151. #undef printf
  152. #define printf _cprintf
  153. #endif
  154. #ifdef OS_LINUX
  155. #ifndef MPOL_PREFERRED
  156. #define MPOL_PREFERRED 1
  157. #endif
  158. #endif
  159. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  160. #define NO_WARMUP
  161. #endif
  162. #ifndef SHM_HUGETLB
  163. #define SHM_HUGETLB 04000
  164. #endif
  165. #ifndef FIXED_PAGESIZE
  166. #define FIXED_PAGESIZE 4096
  167. #endif
  168. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  169. #if defined(_MSC_VER) && !defined(__clang__)
  170. #define CONSTRUCTOR __cdecl
  171. #define DESTRUCTOR __cdecl
  172. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
  173. #define CONSTRUCTOR __attribute__ ((constructor))
  174. #define DESTRUCTOR __attribute__ ((destructor))
  175. #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
  176. #define CONSTRUCTOR __attribute__ ((constructor(101)))
  177. #define DESTRUCTOR __attribute__ ((destructor(101)))
  178. #else
  179. #define CONSTRUCTOR __attribute__ ((constructor))
  180. #define DESTRUCTOR __attribute__ ((destructor))
  181. #endif
  182. #ifdef DYNAMIC_ARCH
  183. gotoblas_t *gotoblas = NULL;
  184. #endif
  185. extern void openblas_warning(int verbose, const char * msg);
  186. #ifndef SMP
  187. #define blas_cpu_number 1
  188. #define blas_num_threads 1
/* Dummy Function */
/* Single-threaded build (no SMP): thread count is fixed at 1 and
   requests to change it are silently ignored. */
int goto_get_num_procs (void) { return 1;};
void goto_set_num_threads(int num_threads) {};
  192. #else
  193. #if defined(OS_LINUX) || defined(OS_SUNOS)
  194. #ifndef NO_AFFINITY
  195. int get_num_procs(void);
  196. #else
/* Return the number of processors usable by this process
   (Linux/SunOS build without affinity support).
   The count is cached in a function-local static; every early-out path
   clamps the result to a minimum of 2.
   NOTE(review): 'nums' is read/written without synchronization —
   presumably benign since all writers store the same value; confirm. */
int get_num_procs(void) {
  static int nums = 0;  /* cached CPU count; 0 means "not queried yet" */
  int ret;
#if defined(__GLIBC_PREREQ)
  cpu_set_t cpuset,*cpusetp;
  size_t size;
#if !__GLIBC_PREREQ(2, 7)
  int i;
#if !__GLIBC_PREREQ(2, 6)
  int n;
#endif
#endif
#endif
  /* First guess: the number of configured processors. */
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if defined(USE_OPENMP)
#if _OPENMP >= 201511
  /* OpenMP 4.5+: prefer the sum of processors over all OpenMP places. */
  int i,n;
  n = 0;
  ret = omp_get_num_places();
  if (ret > 0) for (i=0; i<ret;i++) n+= omp_get_place_num_procs(i);
  if (n > 0) nums = n;
#endif
  return (nums > 0 ? nums : 2);
#endif
#if !defined(OS_LINUX)
  return (nums > 0 ? nums : 2);
#endif
#if !defined(__GLIBC_PREREQ)
  return (nums > 0 ? nums :2);
#else
#if !__GLIBC_PREREQ(2, 3)
  /* glibc too old for sched_getaffinity: fall back to the raw count. */
  return (nums > 0 ? nums :2);
#endif
#if !__GLIBC_PREREQ(2, 7)
  /* Old glibc: count the CPUs set in the affinity mask by hand. */
  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
  if (ret!=0) return (nums > 0 ? nums :2);
  n=0;
#if !__GLIBC_PREREQ(2, 6)
  for (i=0;i<nums;i++)
    if (CPU_ISSET(i,&cpuset)) n++;
  nums=n;
#else
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
  return (nums > 0 ? nums :2);
#else
  /* glibc >= 2.7: respect the process affinity mask; use a dynamically
     sized CPU set when the count exceeds the static CPU_SETSIZE. */
  if (nums >= CPU_SETSIZE) {
    cpusetp = CPU_ALLOC(nums);
    if (cpusetp == NULL) {
      return (nums > 0 ? nums :2);
    }
    size = CPU_ALLOC_SIZE(nums);
    ret = sched_getaffinity(0,size,cpusetp);
    if (ret!=0) {
      CPU_FREE(cpusetp);
      return (nums > 0 ? nums :2);
    }
    ret = CPU_COUNT_S(size,cpusetp);
    if (ret > 0 && ret < nums) nums = ret;
    CPU_FREE(cpusetp);
    return (nums > 0 ? nums :2);
  } else {
    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
    if (ret!=0) {
      return (nums > 0 ? nums :2);
    }
    ret = CPU_COUNT(&cpuset);
    if (ret > 0 && ret < nums) nums = ret;
    return (nums > 0 ? nums :2);
  }
#endif
#endif
}
  270. #endif
  271. #endif
  272. #ifdef OS_ANDROID
  273. int get_num_procs(void) {
  274. static int nums = 0;
  275. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  276. return nums;
  277. }
  278. #endif
  279. #ifdef OS_HAIKU
  280. int get_num_procs(void) {
  281. static int nums = 0;
  282. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  283. return nums;
  284. }
  285. #endif
  286. #ifdef OS_AIX
  287. int get_num_procs(void) {
  288. static int nums = 0;
  289. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  290. return nums;
  291. }
  292. #endif
  293. #ifdef OS_WINDOWS
  294. int get_num_procs(void) {
  295. static int nums = 0;
  296. if (nums == 0) {
  297. SYSTEM_INFO sysinfo;
  298. GetSystemInfo(&sysinfo);
  299. nums = sysinfo.dwNumberOfProcessors;
  300. }
  301. return nums;
  302. }
  303. #endif
  304. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
  305. int get_num_procs(void) {
  306. static int nums = 0;
  307. int m[2];
  308. size_t len;
  309. if (nums == 0) {
  310. m[0] = CTL_HW;
  311. m[1] = HW_NCPU;
  312. len = sizeof(int);
  313. sysctl(m, 2, &nums, &len, NULL, 0);
  314. }
  315. return nums;
  316. }
  317. #endif
  318. #if defined(OS_DARWIN)
  319. int get_num_procs(void) {
  320. static int nums = 0;
  321. size_t len;
  322. if (nums == 0){
  323. len = sizeof(int);
  324. sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
  325. }
  326. return nums;
  327. }
  328. /*
  329. void set_stack_limit(int limitMB){
  330. int result=0;
  331. struct rlimit rl;
  332. rlim_t StackSize;
  333. StackSize=limitMB*1024*1024;
  334. result=getrlimit(RLIMIT_STACK, &rl);
  335. if(result==0){
  336. if(rl.rlim_cur < StackSize){
  337. rl.rlim_cur=StackSize;
  338. result=setrlimit(RLIMIT_STACK, &rl);
  339. if(result !=0){
  340. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  341. }
  342. }
  343. }
  344. }
  345. */
  346. #endif
  347. /*
  348. OpenBLAS uses the numbers of CPU cores in multithreading.
  349. It can be set by openblas_set_num_threads(int num_threads);
  350. */
  351. int blas_cpu_number = 0;
  352. /*
  353. The numbers of threads in the thread pool.
  354. This value is equal or large than blas_cpu_number. This means some threads are sleep.
  355. */
  356. int blas_num_threads = 0;
/* Number of threads OpenBLAS will actually use for computation
   (SMP build); see blas_get_cpu_number() for how it is derived. */
int goto_get_num_procs (void) {
  return blas_cpu_number;
}
  360. static void blas_memory_init(void);
  361. void openblas_fork_handler(void)
  362. {
  363. // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  364. // built with "make USE_OPENMP=0".
  365. // Hanging can still happen when OpenBLAS is built against the libgomp
  366. // implementation of OpenMP. The problem is tracked at:
  367. // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  368. // In the mean time build with USE_OPENMP=0 or link against another
  369. // implementation of OpenMP.
  370. #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  371. int err;
  372. err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init);
  373. if(err != 0)
  374. openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  375. #endif
  376. }
  377. extern int openblas_num_threads_env(void);
  378. extern int openblas_goto_num_threads_env(void);
  379. extern int openblas_omp_num_threads_env(void);
/* Decide how many threads OpenBLAS should use and cache the result in
   blas_num_threads (also mirrored into blas_cpu_number).
   Priority: OPENBLAS_NUM_THREADS, then GOTO_NUM_THREADS, then
   OMP_NUM_THREADS, then the compile-time MAX_CPU_NUMBER.  The result is
   clamped to the detected processor count and to MAX_CPU_NUMBER. */
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
  int max_num;
#endif
  int blas_goto_num = 0;
  int blas_omp_num = 0;
  /* Already determined once; reuse the cached value. */
  if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
  max_num = get_num_procs();
#endif
  // blas_goto_num = 0;
#ifndef USE_OPENMP_UNUSED
  /* OPENBLAS_NUM_THREADS takes precedence; fall back to GOTO_NUM_THREADS. */
  blas_goto_num=openblas_num_threads_env();
  if (blas_goto_num < 0) blas_goto_num = 0;
  if (blas_goto_num == 0) {
    blas_goto_num=openblas_goto_num_threads_env();
    if (blas_goto_num < 0) blas_goto_num = 0;
  }
#endif
  // blas_omp_num = 0;
  /* OMP_NUM_THREADS is the last environment source consulted. */
  blas_omp_num=openblas_omp_num_threads_env();
  if (blas_omp_num < 0) blas_omp_num = 0;
  if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
  /* Never request more threads than the machine exposes. */
  if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
  if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
#ifdef DEBUG
  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif
  blas_cpu_number = blas_num_threads;
  return blas_num_threads;
}
  415. #endif
  416. int openblas_get_num_procs(void) {
  417. #ifndef SMP
  418. return 1;
  419. #else
  420. return get_num_procs();
  421. #endif
  422. }
  423. int openblas_get_num_threads(void) {
  424. #ifndef SMP
  425. return 1;
  426. #else
  427. // init blas_cpu_number if needed
  428. blas_get_cpu_number();
  429. return blas_cpu_number;
  430. #endif
  431. }
  432. int hugetlb_allocated = 0;
  433. #if defined(OS_WINDOWS)
  434. #define LIKELY_ONE(x) (x)
  435. #else
  436. #define LIKELY_ONE(x) (__builtin_expect(x, 1))
  437. #endif
  438. /* Stores information about the allocation and how to release it */
  439. struct alloc_t {
  440. /* Whether this allocation is being used */
  441. int used;
  442. /* Any special attributes needed when releasing this allocation */
  443. int attr;
  444. /* Function that can properly release this memory */
  445. void (*release_func)(struct alloc_t *);
  446. /* Pad to 64-byte alignment */
  447. char pad[64 - 2 * sizeof(int) - sizeof(void(*))];
  448. };
  449. /* Convenience macros for storing release funcs */
  450. #define STORE_RELEASE_FUNC(address, func) \
  451. if (address != (void *)-1) { \
  452. struct alloc_t *alloc_info = (struct alloc_t *)address; \
  453. alloc_info->release_func = func; \
  454. }
  455. #define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr) \
  456. if (address != (void *)-1) { \
  457. struct alloc_t *alloc_info = (struct alloc_t *)address; \
  458. alloc_info->release_func = func; \
  459. alloc_info->attr = attr; \
  460. }
  461. /* The number of bytes that will be allocated for each buffer. When allocating
  462. memory, we store an alloc_t followed by the actual buffer memory. This means
  463. that each allocation always has its associated alloc_t, without the need
  464. for an auxiliary tracking structure. */
  465. static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
  466. #if defined(SMP)
  467. # if defined(OS_WINDOWS)
  468. static DWORD local_storage_key = 0;
  469. DWORD lsk;
  470. # else
  471. static pthread_key_t local_storage_key = 0;
  472. pthread_key_t lsk;
  473. # endif /* defined(OS_WINDOWS) */
  474. #endif /* defined(SMP) */
  475. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  476. static int hot_alloc = 0;
  477. #endif
  478. /* Global lock for memory allocation */
  479. #if defined(USE_PTHREAD_LOCK)
  480. static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
  481. #elif defined(USE_PTHREAD_SPINLOCK)
  482. static pthread_spinlock_t alloc_lock = 0;
  483. #else
  484. static BLASULONG alloc_lock = 0UL;
  485. #endif
  486. #if defined(USE_PTHREAD_LOCK)
  487. static pthread_mutex_t key_lock = PTHREAD_MUTEX_INITIALIZER;
  488. #elif defined(USE_PTHREAD_SPINLOCK)
  489. static pthread_spinlock_t key_lock = 0;
  490. #else
  491. static BLASULONG key_lock = 0UL;
  492. #endif
  493. /* Returns a pointer to the start of the per-thread memory allocation data */
  494. static __inline struct alloc_t ** get_memory_table(void) {
  495. #if defined(SMP)
  496. LOCK_COMMAND(&key_lock);
  497. lsk=local_storage_key;
  498. UNLOCK_COMMAND(&key_lock);
  499. if (!lsk) {
  500. blas_memory_init();
  501. }
  502. # if defined(OS_WINDOWS)
  503. struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
  504. # else
  505. struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
  506. # endif /* defined(OS_WINDOWS) */
  507. #else
  508. static struct alloc_t ** local_memory_table = NULL;
  509. #endif /* defined(SMP) */
  510. #if defined (SMP)
  511. LOCK_COMMAND(&key_lock);
  512. lsk=local_storage_key;
  513. UNLOCK_COMMAND(&key_lock);
  514. if (lsk && !local_memory_table) {
  515. #else
  516. if (!local_memory_table) {
  517. #endif /* defined(SMP) */
  518. local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
  519. memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
  520. #if defined(SMP)
  521. # if defined(OS_WINDOWS)
  522. LOCK_COMMAND(&key_lock);
  523. TlsSetValue(local_storage_key, (void*)local_memory_table);
  524. UNLOCK_COMMAND(&key_lock);
  525. # else
  526. LOCK_COMMAND(&key_lock);
  527. pthread_setspecific(local_storage_key, (void*)local_memory_table);
  528. UNLOCK_COMMAND(&key_lock);
  529. # endif /* defined(OS_WINDOWS) */
  530. #endif /* defined(SMP) */
  531. }
  532. return local_memory_table;
  533. }
  534. #ifdef ALLOC_MMAP
  535. static void alloc_mmap_free(struct alloc_t *alloc_info){
  536. if (munmap(alloc_info, allocation_block_size)) {
  537. printf("OpenBLAS : munmap failed\n");
  538. }
  539. }
  540. #ifdef NO_WARMUP
  541. static void *alloc_mmap(void *address){
  542. void *map_address;
  543. if (address){
  544. map_address = mmap(address,
  545. allocation_block_size,
  546. MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  547. } else {
  548. map_address = mmap(address,
  549. allocation_block_size,
  550. MMAP_ACCESS, MMAP_POLICY, -1, 0);
  551. }
  552. STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
  553. #ifdef OS_LINUX
  554. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  555. #endif
  556. return map_address;
  557. }
  558. #else
  559. #define BENCH_ITERATION 4
  560. #define SCALING 2
/* Time a pointer-chasing walk over the page chain in
   [address, address + size) and return the fastest of BENCH_ITERATION
   runs (in rpcc() ticks).  The caller (alloc_mmap) has pre-linked the
   first word of each page to the next page; this routine temporarily
   rewrites the last page's link to loop back to 'address' so the walk
   terminates in a cycle, and restores the saved word before returning. */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {

  BLASULONG original, *p;
  BLASULONG start, stop, min;
  int iter, i, count;

  min = (BLASULONG)-1;

  /* Save the link word we are about to clobber, then close the chain. */
  original = *(BLASULONG *)(address + size - PAGESIZE);

  *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;

  for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {

    p = (BLASULONG *)address;

    count = size / PAGESIZE;

    start = rpcc();

    /* One dependent load per page: measures access latency for this
       placement, not bandwidth. */
    for (i = 0; i < count; i ++) {
      p = (BLASULONG *)(*p);
    }

    stop = rpcc();

    if (min > stop - start) min = stop - start;
  }

  /* Restore the clobbered word; the extra store of the final pointer
     presumably keeps the compiler from eliding the walk — confirm. */
  *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;

  return min;
}
/* Allocate one buffer with mmap (warm-up build).
   With an explicit hint address the mapping is simply forced there.
   Otherwise, on the first "hot" allocation the routine over-maps
   SCALING times the needed size, links every page into a chain,
   benchmarks each page-aligned candidate start with run_bench(), keeps
   the fastest window and unmaps the rest.  Returns the mapping base or
   (void *)-1 on failure. */
static void *alloc_mmap(void *address){
  void *map_address, *best_address;
  BLASULONG best, start, current, original;
  BLASULONG allocsize;

  if (address){
    /* Just give up use advanced operation */
    map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
#ifdef OS_LINUX
    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
  } else {
#if defined(OS_LINUX) && !defined(NO_WARMUP)
    if (hot_alloc == 0) {
      /* Plain allocation until warm-up has been requested. */
      map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
#ifdef OS_LINUX
      my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif
    } else {
#endif
      /* Over-map so we can pick the best-performing window inside it. */
      map_address = mmap(NULL, allocation_block_size * SCALING,
                         MMAP_ACCESS, MMAP_POLICY, -1, 0);
      if (map_address != (void *)-1) {
#ifdef OS_LINUX
#ifdef DEBUG
        int ret=0;
        ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
        if(ret==-1){
          int errsv=errno;
          perror("OpenBLAS alloc_mmap:");
          printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
        }
#else
        my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif
        /* Benchmark window sized like a DGEMM working set. */
        allocsize = DGEMM_P * DGEMM_Q * sizeof(double);

        start = (BLASULONG)map_address;
        current = (SCALING - 1) * allocation_block_size;
        original = current;

        /* Link the first word of each page to the next page, forming the
           chain that run_bench() walks. */
        while(current > 0 && current <= original) {
          *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
          start += PAGESIZE;
          current -= PAGESIZE;
        }

        *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;

        start = (BLASULONG)map_address;

        best = (BLASULONG)-1;
        best_address = map_address;

        /* Slide a window one page at a time and keep the fastest start. */
        while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {

          current = run_bench(start, allocsize);

          if (best > current) {
            best = current;
            best_address = (void *)start;
          }

          start += PAGESIZE;

        }

        /* Trim the over-mapping: release everything before and after the
           chosen window. */
        if ((BLASULONG)best_address > (BLASULONG)map_address)
          munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);

        munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);

        map_address = best_address;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
        hot_alloc = 2;
#endif
      }
    }
#if defined(OS_LINUX) && !defined(NO_WARMUP)
  }
#endif

  STORE_RELEASE_FUNC(map_address, alloc_mmap_free);

  return map_address;
}
  653. #endif
  654. #endif
  655. #ifdef ALLOC_MALLOC
  656. static void alloc_malloc_free(struct alloc_t *alloc_info){
  657. free(alloc_info);
  658. }
  659. static void *alloc_malloc(void *address){
  660. void *map_address;
  661. map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
  662. if (map_address == (void *)NULL) map_address = (void *)-1;
  663. STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
  664. return map_address;
  665. }
  666. #endif
  667. #ifdef ALLOC_QALLOC
  668. void *qalloc(int flags, size_t bytes);
  669. void *qfree (void *address);
  670. #define QNONCACHE 0x1
  671. #define QCOMMS 0x2
  672. #define QFAST 0x4
  673. static void alloc_qalloc_free(struct alloc_t *alloc_info){
  674. qfree(alloc_info);
  675. }
  676. static void *alloc_qalloc(void *address){
  677. void *map_address;
  678. map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
  679. if (map_address == (void *)NULL) map_address = (void *)-1;
  680. STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
  681. return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
  682. }
  683. #endif
  684. #ifdef ALLOC_WINDOWS
  685. static void alloc_windows_free(struct alloc_t *alloc_info){
  686. VirtualFree(alloc_info, 0, MEM_RELEASE);
  687. }
  688. static void *alloc_windows(void *address){
  689. void *map_address;
  690. map_address = VirtualAlloc(address,
  691. allocation_block_size,
  692. MEM_RESERVE | MEM_COMMIT,
  693. PAGE_READWRITE);
  694. if (map_address == (void *)NULL) map_address = (void *)-1;
  695. STORE_RELEASE_FUNC(map_address, alloc_windows_free);
  696. return map_address;
  697. }
  698. #endif
  699. #ifdef ALLOC_DEVICEDRIVER
  700. #ifndef DEVICEDRIVER_NAME
  701. #define DEVICEDRIVER_NAME "/dev/mapper"
  702. #endif
  703. static void alloc_devicedirver_free(struct alloc_t *alloc_info){
  704. int attr = alloc_info -> attr;
  705. if (munmap(address, allocation_block_size)) {
  706. printf("OpenBLAS : Bugphysarea unmap failed.\n");
  707. }
  708. if (close(attr)) {
  709. printf("OpenBLAS : Bugphysarea close failed.\n");
  710. }
  711. }
/* Map a physically contiguous buffer from the DEVICEDRIVER_NAME device.
 * The open fd is saved in the header's attr so the release handler can
 * close it later.  Returns (void *)-1 if the device cannot be opened.
 * NOTE(review): if mmap fails, fd is not closed on this path — verify that
 * STORE_RELEASE_FUNC_WITH_ATTR skips failed mappings and whether fd leaks. */
static void *alloc_devicedirver(void *address){

  int fd;
  void *map_address;

  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
    return (void *)-1;
  }

  map_address = mmap(address, allocation_block_size,
                     PROT_READ | PROT_WRITE,
                     MAP_FILE | MAP_SHARED,
                     fd, 0);

  STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);

  return map_address;
}
  725. #endif
  726. #ifdef ALLOC_SHM
/* Release handler: detach the System V shared memory segment.  The segment
 * was marked IPC_RMID at allocation time, so the last detach destroys it. */
static void alloc_shm_free(struct alloc_t *alloc_info){
  if (shmdt(alloc_info)) {
    printf("OpenBLAS : Shared memory unmap failed.\n");
  }
}

/* Allocate one buffer as a private SysV shared-memory segment.
 * Returns shmat's (void *)-1 on failure. */
static void *alloc_shm(void *address){
  void *map_address;
  int shmid;

  shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);

  map_address = (void *)shmat(shmid, address, 0);

  if (map_address != (void *)-1){

#ifdef OS_LINUX
    /* Prefer local-node pages for the new mapping. */
    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
#endif

    /* Mark the id for deletion so the segment disappears once detached. */
    shmctl(shmid, IPC_RMID, 0);

    /* Fill in the header by hand (not via STORE_RELEASE_FUNC) so the shm
       id can be kept in attr. */
    struct alloc_t *alloc_info = (struct alloc_t *)map_address;
    alloc_info->release_func = alloc_shm_free;
    alloc_info->attr = shmid;
  }

  return map_address;
}
  748. #endif
  749. #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
/* Release handler for hugepage buffers; the mechanism matches the one
 * alloc_hugetlb used on each platform (shm detach / munmap / VirtualFree). */
static void alloc_hugetlb_free(struct alloc_t *alloc_info){
#if defined(OS_LINUX) || defined(OS_AIX)
  if (shmdt(alloc_info)) {
    printf("OpenBLAS : Hugepage unmap failed.\n");
  }
#endif

#ifdef __sun__
  munmap(alloc_info, allocation_block_size);
#endif

#ifdef OS_WINDOWS
  VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
#endif
}
  763. static void *alloc_hugetlb(void *address){
  764. void *map_address = (void *)-1;
  765. #if defined(OS_LINUX) || defined(OS_AIX)
  766. int shmid;
  767. shmid = shmget(IPC_PRIVATE, allocation_block_size,
  768. #ifdef OS_LINUX
  769. SHM_HUGETLB |
  770. #endif
  771. #ifdef OS_AIX
  772. SHM_LGPAGE | SHM_PIN |
  773. #endif
  774. IPC_CREAT | SHM_R | SHM_W);
  775. if (shmid != -1) {
  776. map_address = (void *)shmat(shmid, address, SHM_RND);
  777. #ifdef OS_LINUX
  778. my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
  779. #endif
  780. if (map_address != (void *)-1){
  781. shmctl(shmid, IPC_RMID, 0);
  782. }
  783. }
  784. #endif
  785. #ifdef __sun__
  786. struct memcntl_mha mha;
  787. mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  788. mha.mha_flags = 0;
  789. mha.mha_pagesize = HUGE_PAGESIZE;
  790. memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
  791. map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
  792. #endif
  793. #ifdef OS_WINDOWS
  794. HANDLE hToken;
  795. TOKEN_PRIVILEGES tp;
  796. if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
  797. tp.PrivilegeCount = 1;
  798. tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
  799. if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
  800. CloseHandle(hToken);
  801. return (void*)-1;
  802. }
  803. if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
  804. CloseHandle(hToken);
  805. return (void*)-1;
  806. }
  807. map_address = (void *)VirtualAlloc(address,
  808. allocation_block_size,
  809. MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
  810. PAGE_READWRITE);
  811. tp.Privileges[0].Attributes = 0;
  812. AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
  813. if (map_address == (void *)NULL) map_address = (void *)-1;
  814. #endif
  815. STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
  816. return map_address;
  817. }
  818. #endif
  819. #ifdef ALLOC_HUGETLBFILE
/* PID used to build a per-process file name inside the hugetlbfs mount. */
static int hugetlb_pid = 0;

/* Release handler for hugetlbfs buffers: unmap the region and close the
 * backing file descriptor stored in attr. */
static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){

  int attr = alloc_info -> attr;

  if (munmap(alloc_info, allocation_block_size)) {
    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  }

  if (close(attr)) {
    printf("OpenBLAS : HugeTLBfs close failed.\n");
  }
}
  830. static void *alloc_hugetlbfile(void *address){
  831. void *map_address = (void *)-1;
  832. int fd;
  833. char filename[64];
  834. if (!hugetlb_pid) hugetlb_pid = getpid();
  835. sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
  836. if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
  837. return (void *)-1;
  838. }
  839. unlink(filename);
  840. map_address = mmap(address, allocation_block_size,
  841. PROT_READ | PROT_WRITE,
  842. MAP_SHARED,
  843. fd, 0);
  844. STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
  845. return map_address;
  846. }
  847. #endif
#ifdef SEEK_ADDRESS
/* Let the kernel choose every mapping address (hint of 0). */
static BLASULONG base_address = 0UL;
#else
/* Try mappings at a fixed base, advanced per allocated buffer. */
static BLASULONG base_address = BASE_ADDRESS;
#endif

#ifdef HAVE_C11
static _Atomic int memory_initialized = 0;
#else
/* Pre-C11 fallback; initialization is additionally guarded by alloc_lock
 * in SMP builds (see blas_memory_alloc). */
static volatile int memory_initialized = 0;
#endif
  858. /* Memory allocation routine */
  859. /* procpos ... indicates where it comes from */
  860. /* 0 : Level 3 functions */
  861. /* 1 : Level 2 functions */
  862. /* 2 : Thread */
  863. static void blas_memory_cleanup(void* ptr){
  864. if (ptr) {
  865. struct alloc_t ** table = (struct alloc_t **)ptr;
  866. int pos;
  867. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  868. struct alloc_t *alloc_info = table[pos];
  869. if (alloc_info) {
  870. alloc_info->release_func(alloc_info);
  871. table[pos] = (void *)0;
  872. }
  873. }
  874. free(table);
  875. }
  876. }
/* One-time creation of the thread-local-storage key holding each thread's
 * buffer table.  With pthreads the key's destructor frees the table at
 * thread exit; TlsAlloc has no destructor, so on Windows the cleanup is
 * driven from DllMain's DLL_THREAD_DETACH instead. */
static void blas_memory_init(void){
#if defined(SMP)
# if defined(OS_WINDOWS)
  local_storage_key = TlsAlloc();
# else
  pthread_key_create(&local_storage_key, blas_memory_cleanup);
# endif /* defined(OS_WINDOWS) */
#endif /* defined(SMP) */
}
  886. void *blas_memory_alloc(int procpos){
  887. int position;
  888. void *map_address;
  889. void *(*memoryalloc[])(void *address) = {
  890. #ifdef ALLOC_DEVICEDRIVER
  891. alloc_devicedirver,
  892. #endif
  893. #ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB)
  894. alloc_shm,
  895. #endif
  896. #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
  897. alloc_hugetlb,
  898. #endif
  899. #ifdef ALLOC_MMAP
  900. alloc_mmap,
  901. #endif
  902. #ifdef ALLOC_QALLOC
  903. alloc_qalloc,
  904. #endif
  905. #ifdef ALLOC_WINDOWS
  906. alloc_windows,
  907. #endif
  908. #ifdef ALLOC_MALLOC
  909. alloc_malloc,
  910. #endif
  911. NULL,
  912. };
  913. void *(**func)(void *address);
  914. struct alloc_t * alloc_info;
  915. struct alloc_t ** alloc_table;
  916. #if defined(SMP) && !defined(USE_OPENMP)
  917. int mi;
  918. LOCK_COMMAND(&alloc_lock);
  919. mi=memory_initialized;
  920. UNLOCK_COMMAND(&alloc_lock);
  921. if (!LIKELY_ONE(mi)) {
  922. #else
  923. if (!LIKELY_ONE(memory_initialized)) {
  924. #endif
  925. #if defined(SMP) && !defined(USE_OPENMP)
  926. /* Only allow a single thread to initialize memory system */
  927. LOCK_COMMAND(&alloc_lock);
  928. if (!memory_initialized) {
  929. #endif
  930. blas_memory_init();
  931. #ifdef DYNAMIC_ARCH
  932. gotoblas_dynamic_init();
  933. #endif
  934. #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  935. gotoblas_affinity_init();
  936. #endif
  937. #ifdef SMP
  938. if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
  939. #endif
  940. #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64)
  941. #ifndef DYNAMIC_ARCH
  942. blas_set_parameter();
  943. #endif
  944. #endif
  945. memory_initialized = 1;
  946. #if defined(SMP) && !defined(USE_OPENMP)
  947. }
  948. UNLOCK_COMMAND(&alloc_lock);
  949. #endif
  950. }
  951. #ifdef DEBUG
  952. printf("Alloc Start ...\n");
  953. #endif
  954. position = 0;
  955. alloc_table = get_memory_table();
  956. do {
  957. if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
  958. position ++;
  959. } while (position < NUM_BUFFERS);
  960. goto error;
  961. allocation :
  962. #ifdef DEBUG
  963. printf(" Position -> %d\n", position);
  964. #endif
  965. alloc_info = alloc_table[position];
  966. if (!alloc_info) {
  967. do {
  968. #ifdef DEBUG
  969. printf("Allocation Start : %lx\n", base_address);
  970. #endif
  971. map_address = (void *)-1;
  972. func = &memoryalloc[0];
  973. while ((*func != NULL) && (map_address == (void *) -1)) {
  974. map_address = (*func)((void *)base_address);
  975. #ifdef ALLOC_DEVICEDRIVER
  976. if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
  977. fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
  978. }
  979. #endif
  980. #ifdef ALLOC_HUGETLBFILE
  981. if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
  982. #ifndef OS_WINDOWS
  983. fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
  984. #endif
  985. }
  986. #endif
  987. #if (defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
  988. if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
  989. #endif
  990. func ++;
  991. }
  992. #ifdef DEBUG
  993. printf(" Success -> %08lx\n", map_address);
  994. #endif
  995. if (((BLASLONG) map_address) == -1) base_address = 0UL;
  996. if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
  997. } while ((BLASLONG)map_address == -1);
  998. alloc_table[position] = alloc_info = map_address;
  999. #ifdef DEBUG
  1000. printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
  1001. #endif
  1002. }
  1003. #ifdef DEBUG
  1004. printf("Mapped : %p %3d\n\n", (void *)alloc_info, position);
  1005. #endif
  1006. alloc_info->used = 1;
  1007. return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
  1008. error:
  1009. printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
  1010. printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
  1011. printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
  1012. printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
  1013. printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
  1014. printf("cpu cores than what OpenBLAS was configured to handle.\n");
  1015. return NULL;
  1016. }
  1017. void blas_memory_free(void *buffer){
  1018. #ifdef DEBUG
  1019. int position;
  1020. struct alloc_t ** alloc_table;
  1021. #endif
  1022. /* Since we passed an offset pointer to the caller, get back to the actual allocation */
  1023. struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
  1024. #ifdef DEBUG
  1025. printf("Unmapped Start : %p ...\n", alloc_info);
  1026. #endif
  1027. alloc_info->used = 0;
  1028. #ifdef DEBUG
  1029. printf("Unmap Succeeded.\n\n");
  1030. #endif
  1031. return;
  1032. #ifdef DEBUG
  1033. alloc_table = get_memory_table();
  1034. for (position = 0; position < NUM_BUFFERS; position++){
  1035. if (alloc_table[position]) {
  1036. printf("%4ld %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
  1037. }
  1038. }
  1039. #endif
  1040. return;
  1041. }
/* Untracked buffer straight from malloc, for callers that do not need the
 * slot-table machinery.  May return NULL on allocation failure. */
void *blas_memory_alloc_nolock(int unused) {
  void *map_address;
  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  return map_address;
}

/* Counterpart of blas_memory_alloc_nolock. */
void blas_memory_free_nolock(void * map_address) {
  free(map_address);
}
#ifdef SMP
/* Release the calling thread's buffer table (thread-teardown entry point,
 * also used by DllMain on Windows where TLS keys have no destructor). */
void blas_thread_memory_cleanup(void) {
  blas_memory_cleanup((void*)get_memory_table());
}
#endif

/* Tear down the library: stop worker threads, free this thread's cached
 * buffers, and reset the mapping base address for a possible re-init. */
void blas_shutdown(void){
#ifdef SMP
  BLASFUNC(blas_thread_shutdown)();
#endif

#ifdef SMP
  /* Only clean up if we were built for threading and TLS was initialized */
  if (local_storage_key)
#endif
    blas_thread_memory_cleanup();

#ifdef SEEK_ADDRESS
  base_address = 0UL;
#else
  base_address = BASE_ADDRESS;
#endif

  return;
}

#if defined(OS_LINUX) && !defined(NO_WARMUP)

#ifdef SMP
/* Lock serializing the page-touch warm-up pass across worker threads. */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t init_lock = 0;
#else
static BLASULONG init_lock = 0UL;
#endif
#endif
/* Warm-up helper, run with the blas_queue_t kernel signature: writes one
 * word into every page of the buffer so the OS faults the pages in before
 * real GEMM work, then streams the first L2-sized chunk at cache-line
 * granularity.  No-op on POWER/SPARC. */
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
                          void *sa, void *sb, BLASLONG pos) {

#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)

  size_t size;
  BLASULONG buffer;

  size = allocation_block_size - PAGESIZE;
  buffer = (BLASULONG)sa + GEMM_OFFSET_A;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  /* NOTE(review): hot_alloc == 2 is set in the mmap allocation path —
   * presumably meaning pages were already touched there; confirm. */
  if (hot_alloc != 2) {
#endif

#ifdef SMP
    LOCK_COMMAND(&init_lock);
#endif

    /* Touch one word per page to force the mapping in. */
    while (size > 0) {
      *(int *)buffer = size;
      buffer += PAGESIZE;
      size -= PAGESIZE;
    }

#ifdef SMP
    UNLOCK_COMMAND(&init_lock);
#endif

    /* Stream an L2-sized prefix at 64-byte (cache line) steps. */
    size = MIN((allocation_block_size - PAGESIZE), L2_SIZE);
    buffer = (BLASULONG)sa + GEMM_OFFSET_A;

    while (size > 0) {
      *(int *)buffer = size;
      buffer += 64;
      size -= 64;
    }

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  }
#endif

#endif
}

#ifdef SMP

/* Queue one _touch_memory job per worker thread so every CPU faults the
 * shared buffer into its own page tables / caches. */
static void _init_thread_memory(void *buffer) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  int num_cpu;

  for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {

    blas_queue_init(&queue[num_cpu]);

    queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
    queue[num_cpu].routine = &_touch_memory;
    queue[num_cpu].args = NULL;
    /* Last link is fixed up to NULL right after the loop. */
    queue[num_cpu].next = &queue[num_cpu + 1];
  }

  queue[num_cpu - 1].next = NULL;
  queue[0].sa = buffer;

  exec_blas(num_cpu, queue);
}
#endif

/* Allocate the first buffer at library start-up and pre-fault it (on all
 * worker threads when SMP) so the first real call avoids page-fault cost. */
static void gotoblas_memory_init(void) {

  void *buffer;

  hot_alloc = 1;

  buffer = (void *)blas_memory_alloc(0);

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif

  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));

#else

  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);

#endif

  blas_memory_free(buffer);
}
  1145. #endif
  1146. /* Initialization for all function; this function should be called before main */
  1147. static int gotoblas_initialized = 0;
  1148. extern void openblas_read_env(void);
/* Library constructor: runs before main (or at shared-library load).
 * Order matters: fork handler and env first, then dynamic-arch and
 * affinity, optional memory warm-up, thread pool, profiling. */
void CONSTRUCTOR gotoblas_init(void) {

  if (gotoblas_initialized) return;

#ifdef SMP
  openblas_fork_handler();
#endif

  openblas_read_env();

#ifdef PROFILE
  /* Suspend profiling while initializing. */
  moncontrol (0);
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_init();
#endif

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  gotoblas_memory_init();
#endif

/* Disabled experiment: raise the stack limit to its hard maximum. */
//#if defined(OS_LINUX)
#if 0
  struct rlimit curlimit;
  if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
  {
    if ( curlimit.rlim_cur != curlimit.rlim_max )
    {
      curlimit.rlim_cur = curlimit.rlim_max;
      setrlimit(RLIMIT_STACK, &curlimit);
    }
  }
#endif

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_init();
#endif

  gotoblas_initialized = 1;

#ifdef PROFILE
  moncontrol (1);
#endif

}
/* Library destructor: mirror of gotoblas_init.  Shuts the allocator and
 * thread pool down, deletes the TLS key, and unwinds the optional
 * profiling / affinity / dynamic-arch subsystems. */
void DESTRUCTOR gotoblas_quit(void) {

  if (gotoblas_initialized == 0) return;

  blas_shutdown();

#if defined(SMP)
#if defined(OS_WINDOWS)
  TlsFree(local_storage_key);
#else
  pthread_key_delete(local_storage_key);
#endif
#endif

#ifdef PROFILE
  moncontrol (0);
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_quit();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_quit();
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_quit();
#endif

  gotoblas_initialized = 0;

#ifdef PROFILE
  moncontrol (1);
#endif
}
  1220. #if defined(_MSC_VER) && !defined(__clang__)
/* Windows DLL entry point: initialize on process attach, free the
 * detaching thread's cached buffers, and tear everything down on process
 * detach. */
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
{
  switch (ul_reason_for_call)
  {
    case DLL_PROCESS_ATTACH:
      gotoblas_init();
      break;
    case DLL_THREAD_ATTACH:
      break;
    case DLL_THREAD_DETACH:
#if defined(SMP)
      /* TlsAlloc keys have no destructor, so per-thread tables must be
       * released here. */
      blas_thread_memory_cleanup();
#endif
      break;
    case DLL_PROCESS_DETACH:
      gotoblas_quit();
      break;
    default:
      break;
  }
  return TRUE;
}

/*
  This is to allow static linking.
  Code adapted from Google performance tools:
  https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  Reference:
  https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
*/

/* CRT termination hook: run the destructor when the process ends even in
 * static-link builds where DllMain is never invoked. */
static int on_process_term(void)
{
  gotoblas_quit();
  return 0;
}
  1256. #ifdef _WIN64
  1257. #pragma comment(linker, "/INCLUDE:_tls_used")
  1258. #else
  1259. #pragma comment(linker, "/INCLUDE:__tls_used")
  1260. #endif
  1261. #ifdef _WIN64
  1262. #pragma const_seg(".CRT$XLB")
  1263. #else
  1264. #pragma data_seg(".CRT$XLB")
  1265. #endif
  1266. #ifdef _WIN64
  1267. static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
  1268. #pragma const_seg()
  1269. #else
  1270. static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
  1271. #pragma data_seg()
  1272. #endif
  1273. #ifdef _WIN64
  1274. #pragma const_seg(".CRT$XTU")
  1275. #else
  1276. #pragma data_seg(".CRT$XTU")
  1277. #endif
  1278. #ifdef _WIN64
  1279. static const int(*p_process_term)(void) = on_process_term;
  1280. #pragma const_seg()
  1281. #else
  1282. static int(*p_process_term)(void) = on_process_term;
  1283. #pragma data_seg()
  1284. #endif
  1285. #endif
  1286. #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Don't call me; this is just work around for PGI / Sun bug */
void gotoblas_dummy_for_PGI(void) {

  gotoblas_init();
  gotoblas_quit();

#if __PGIC__ < 19
#if 0
  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
#else
  /* Plant constructor/destructor calls directly in .init/.fini since the
   * PGI toolchain mishandles the constructor attribute. */
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
  1301. #endif
  1302. #else
  1303. /* USE_TLS / COMPILE_TLS not set */
  1304. #include <errno.h>
  1305. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  1306. #define ALLOC_WINDOWS
  1307. #ifndef MEM_LARGE_PAGES
  1308. #define MEM_LARGE_PAGES 0x20000000
  1309. #endif
  1310. #elif !defined(OS_EMBEDDED)
  1311. #define ALLOC_MMAP
  1312. #define ALLOC_MALLOC
  1313. #else
  1314. #define ALLOC_MALLOC
/* Minimal no-op libc stubs for embedded targets without stdio/env support. */
inline int puts(const char *str) { return 0; }
inline int printf(const char *format, ...) { return 0; }
inline char *getenv(const char *name) { return ""; }
inline int atoi(const char *str) { return 0; }
  1319. #endif
  1320. #include <stdlib.h>
  1321. #include <stdio.h>
  1322. #include <fcntl.h>
  1323. #if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
  1324. #include <sys/mman.h>
  1325. #ifndef NO_SYSV_IPC
  1326. #include <sys/shm.h>
  1327. #endif
  1328. #include <sys/ipc.h>
  1329. #endif
  1330. #include <sys/types.h>
  1331. #ifdef OS_LINUX
  1332. #include <sys/sysinfo.h>
  1333. #include <sched.h>
  1334. #include <errno.h>
  1335. #include <sys/syscall.h>
  1336. #include <sys/time.h>
  1337. #include <sys/resource.h>
  1338. #endif
  1339. #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
  1340. #include <sys/sysctl.h>
  1341. #include <sys/resource.h>
  1342. #endif
  1343. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
  1344. #include <conio.h>
  1345. #undef printf
  1346. #define printf _cprintf
  1347. #endif
  1348. #ifdef OS_LINUX
  1349. #ifndef MPOL_PREFERRED
  1350. #define MPOL_PREFERRED 1
  1351. #endif
  1352. #endif
  1353. #if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
  1354. #define NO_WARMUP
  1355. #endif
  1356. #ifndef SHM_HUGETLB
  1357. #define SHM_HUGETLB 04000
  1358. #endif
  1359. #ifndef FIXED_PAGESIZE
  1360. #define FIXED_PAGESIZE 4096
  1361. #endif
  1362. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  1363. #if defined(_MSC_VER) && !defined(__clang__)
  1364. #define CONSTRUCTOR __cdecl
  1365. #define DESTRUCTOR __cdecl
  1366. #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
  1367. #define CONSTRUCTOR __attribute__ ((constructor))
  1368. #define DESTRUCTOR __attribute__ ((destructor))
  1369. #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
  1370. #define CONSTRUCTOR __attribute__ ((constructor(101)))
  1371. #define DESTRUCTOR __attribute__ ((destructor(101)))
  1372. #else
  1373. #define CONSTRUCTOR __attribute__ ((constructor))
  1374. #define DESTRUCTOR __attribute__ ((destructor))
  1375. #endif
  1376. #ifdef DYNAMIC_ARCH
  1377. gotoblas_t *gotoblas = NULL;
  1378. #endif
  1379. extern void openblas_warning(int verbose, const char * msg);
  1380. #ifndef SMP
  1381. #define blas_cpu_number 1
  1382. #define blas_num_threads 1
/* Dummy Function: single-threaded builds always report one processor and
 * ignore requests to change the thread count. */
int goto_get_num_procs (void) { return 1;};
void goto_set_num_threads(int num_threads) {};
  1386. #else
  1387. #if defined(OS_LINUX) || defined(OS_SUNOS)
  1388. #ifndef NO_AFFINITY
  1389. int get_num_procs(void);
  1390. #else
/* Linux/SunOS (NO_AFFINITY builds): number of processors available to this
 * process.  Starts from _SC_NPROCESSORS_CONF, then narrows by OpenMP places
 * or the sched_getaffinity CPU mask, depending on glibc capabilities.
 * Falls back to 2 whenever a probe fails or yields nothing. */
int get_num_procs(void) {
  static int nums = 0;
  int ret;

#if defined(__GLIBC_PREREQ)
  cpu_set_t cpuset,*cpusetp;
  size_t size;

#if !__GLIBC_PREREQ(2, 7)
  int i;
#if !__GLIBC_PREREQ(2, 6)
  int n;
#endif
#endif
#endif

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

#if defined(USE_OPENMP)
/* if (omp_get_proc_bind() != omp_proc_bind_false) */
#if _OPENMP >= 201511
  /* OpenMP 4.5+: sum the processors over all OMP places. */
  int i,n;
  n = 0;
  ret = omp_get_num_places();
  if (ret > 0) for (i=0;i<ret;i++) n+= omp_get_place_num_procs(i);
  if (n > 0) nums = n;
#endif
  return (nums > 0 ? nums :2);
#endif

#if !defined(OS_LINUX)
  return (nums > 0 ? nums :2);
#endif

#if !defined(__GLIBC_PREREQ)
  return (nums > 0 ? nums :2);
#else
#if !__GLIBC_PREREQ(2, 3)
  /* sched_getaffinity not usable before glibc 2.3. */
  return (nums > 0 ? nums :2);
#endif

#if !__GLIBC_PREREQ(2, 7)
  /* glibc < 2.7: count CPUs in the affinity mask by hand (or with the
   * fixed-size CPU_COUNT once available in 2.6). */
  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
  if (ret!=0) return (nums > 0 ? nums :2);
  n=0;
#if !__GLIBC_PREREQ(2, 6)
  for (i=0;i<(nums > 0 ? nums :2);i++)
    if (CPU_ISSET(i,&cpuset)) n++;
  nums=n;
#else
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
  return (nums > 0 ? nums :2);
#else
  /* glibc >= 2.7: use a dynamically sized set when the configured CPU
   * count exceeds the fixed CPU_SETSIZE. */
  if (nums >= CPU_SETSIZE) {
    cpusetp = CPU_ALLOC(nums);
    if (cpusetp == NULL) {
      return (nums > 0 ? nums :2);
    }
    size = CPU_ALLOC_SIZE(nums);
    ret = sched_getaffinity(0,size,cpusetp);
    if (ret!=0) {
      CPU_FREE(cpusetp);
      return (nums > 0 ? nums :2);
    }
    ret = CPU_COUNT_S(size,cpusetp);
    if (ret > 0 && ret < nums) nums = ret;
    CPU_FREE(cpusetp);
    return (nums > 0 ? nums :2);
  } else {
    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
    if (ret!=0) {
      return (nums > 0 ? nums :2);
    }
    ret = CPU_COUNT(&cpuset);
    if (ret > 0 && ret < nums) nums = ret;
    return (nums > 0 ? nums :2);
  }
#endif
#endif
}
  1465. #endif
  1466. #endif
  1467. #ifdef OS_ANDROID
  1468. int get_num_procs(void) {
  1469. static int nums = 0;
  1470. if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  1471. return nums;
  1472. }
  1473. #endif
#ifdef OS_HAIKU
/* Haiku: configured processor count, cached after the first query. */
int get_num_procs(void) {
  static int nums = 0;
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  return nums;
}
#endif

#ifdef OS_AIX
/* AIX: configured processor count, cached after the first query. */
int get_num_procs(void) {
  static int nums = 0;
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  return nums;
}
#endif
#ifdef OS_WINDOWS
/* Windows: processor count from GetSystemInfo, cached after first query. */
int get_num_procs(void) {

  static int nums = 0;

  if (nums == 0) {

    SYSTEM_INFO sysinfo;

    GetSystemInfo(&sysinfo);

    nums = sysinfo.dwNumberOfProcessors;
  }

  return nums;
}
#endif

#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
/* BSD family: processor count via sysctl(HW_NCPU), cached.
 * NOTE(review): the sysctl return value is unchecked — on failure nums
 * stays 0 and callers receive 0; confirm whether a fallback is wanted. */
int get_num_procs(void) {

  static int nums = 0;

  int m[2];
  size_t len;

  if (nums == 0) {

    m[0] = CTL_HW;
    m[1] = HW_NCPU;

    len = sizeof(int);

    sysctl(m, 2, &nums, &len, NULL, 0);
  }

  return nums;
}
#endif
#if defined(OS_DARWIN)
/* macOS: physical core count via sysctlbyname, cached after first query. */
int get_num_procs(void) {
  static int nums = 0;
  size_t len;
  if (nums == 0){
    len = sizeof(int);
    sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
  }
  return nums;
}
  1523. /*
  1524. void set_stack_limit(int limitMB){
  1525. int result=0;
  1526. struct rlimit rl;
  1527. rlim_t StackSize;
  1528. StackSize=limitMB*1024*1024;
  1529. result=getrlimit(RLIMIT_STACK, &rl);
  1530. if(result==0){
  1531. if(rl.rlim_cur < StackSize){
  1532. rl.rlim_cur=StackSize;
  1533. result=setrlimit(RLIMIT_STACK, &rl);
  1534. if(result !=0){
  1535. fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
  1536. }
  1537. }
  1538. }
  1539. }
  1540. */
  1541. #endif
/*
OpenBLAS uses this number of CPU cores for multithreading.
It can be set with openblas_set_num_threads(int num_threads);
*/
  1546. int blas_cpu_number = 0;
/*
The number of threads in the thread pool.
This value is greater than or equal to blas_cpu_number, meaning some threads may be idle.
*/
  1551. int blas_num_threads = 0;
/* Public query: number of threads OpenBLAS will currently use. */
int goto_get_num_procs (void) {
  return blas_cpu_number;
}
/* Install a pthread_atfork prepare handler that shuts the worker pool down
 * in the parent before fork, so the child does not inherit stale or locked
 * thread state. */
void openblas_fork_handler(void)
{
  // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  // built with "make USE_OPENMP=0".
  // Hanging can still happen when OpenBLAS is built against the libgomp
  // implementation of OpenMP. The problem is tracked at:
  // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  // In the mean time build with USE_OPENMP=0 or link against another
  // implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  int err;
  err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
  if(err != 0)
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
#endif
}
  1571. extern int openblas_num_threads_env(void);
  1572. extern int openblas_goto_num_threads_env(void);
  1573. extern int openblas_omp_num_threads_env(void);
/* Decide how many threads to use.  Precedence: OPENBLAS_NUM_THREADS, then
 * GOTO_NUM_THREADS (both skipped under OpenMP), then OMP_NUM_THREADS, then
 * the compiled-in MAX_CPU_NUMBER — always capped by the detected processor
 * count and MAX_CPU_NUMBER.  Caches the result in blas_num_threads and
 * mirrors it into blas_cpu_number. */
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
  int max_num;
#endif
  int blas_goto_num = 0;
  int blas_omp_num = 0;

  if (blas_num_threads) return blas_num_threads;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
  max_num = get_num_procs();
#endif

  // blas_goto_num = 0;
#ifndef USE_OPENMP
  blas_goto_num=openblas_num_threads_env();
  if (blas_goto_num < 0) blas_goto_num = 0;

  if (blas_goto_num == 0) {
    blas_goto_num=openblas_goto_num_threads_env();
    if (blas_goto_num < 0) blas_goto_num = 0;
  }
#endif

  // blas_omp_num = 0;
  blas_omp_num=openblas_omp_num_threads_env();
  if (blas_omp_num < 0) blas_omp_num = 0;

  if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
  else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
  else blas_num_threads = MAX_CPU_NUMBER;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
  /* Never exceed the number of processors actually present. */
  if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif

  if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;

#ifdef DEBUG
  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif

  blas_cpu_number = blas_num_threads;

  return blas_num_threads;
}
  1609. #endif
/* Number of processors visible to OpenBLAS (1 in single-threaded builds). */
int openblas_get_num_procs(void) {
#ifndef SMP
  return 1;
#else
  return get_num_procs();
#endif
}

/* Number of threads OpenBLAS will actually use for computation. */
int openblas_get_num_threads(void) {
#ifndef SMP
  return 1;
#else
  // init blas_cpu_number if needed
  blas_get_cpu_number();
  return blas_cpu_number;
#endif
}
/* Book-keeping record for one buffer in the non-TLS allocator: the mapped
 * address, its matching release routine, and an extra attribute (e.g. a
 * file descriptor or shm id) the release routine may need. */
struct release_t {
  void *address;
  void (*func)(struct release_t *);
  long attr;
};

/* Non-zero once any hugepage allocation has succeeded. */
int hugetlb_allocated = 0;

/* Fixed table of release records plus an overflow table used once
 * release_pos exceeds NUM_BUFFERS. */
static struct release_t release_info[NUM_BUFFERS];
static struct release_t *new_release_info;
/* Next free slot across both tables. */
static int release_pos = 0;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
/* Warm-up state flag; see _touch_memory / gotoblas_memory_init. */
static int hot_alloc = 0;
#endif

/* Global lock for memory allocation */

#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
#endif
  1646. #ifdef ALLOC_MMAP
  1647. static void alloc_mmap_free(struct release_t *release){
  1648. if (!release->address) return;
  1649. if (munmap(release -> address, BUFFER_SIZE)) {
  1650. int errsv=errno;
  1651. perror("OpenBLAS : munmap failed:");
  1652. printf("error code=%d,\trelease->address=%p\n",errsv,release->address);
  1653. }
  1654. }
#ifdef NO_WARMUP

/* Allocate one BUFFER_SIZE region via anonymous mmap (no warm-up pass).
 * If a hint address is given, demand that exact placement with MAP_FIXED.
 * On success the mapping is registered in the release table so that
 * blas_shutdown() can unmap it later. */
static void *alloc_mmap(void *address){
  void *map_address;

  if (address){
    map_address = mmap(address,
                       BUFFER_SIZE,
                       MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
  } else {
    map_address = mmap(address,
                       BUFFER_SIZE,
                       MMAP_ACCESS, MMAP_POLICY, -1, 0);
  }

  if (map_address != (void *)-1) {
    /* Register the new mapping; the release table is shared state, so
     * guard it except under OpenMP (where callers serialize). */
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    LOCK_COMMAND(&alloc_lock);
#endif
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_mmap_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
    }
    release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    UNLOCK_COMMAND(&alloc_lock);
#endif
  } else {
#ifdef DEBUG
    int errsv=errno;
    perror("OpenBLAS : mmap failed:");
    printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
#endif
  }

  /* Bind the pages to the preferred NUMA node.
   * NOTE(review): this runs even when mmap failed (map_address is
   * (void *)-1) — presumably my_mbind tolerates that; verify. */
#ifdef OS_LINUX
  my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

  return map_address;
}
#else

#define BENCH_ITERATION 4
#define SCALING 2

/* Time a pointer-chasing walk over 'size' bytes starting at 'address',
 * one hop per page, and return the best (minimum) cycle count out of
 * BENCH_ITERATION runs.  Used by the warm-up allocator to find the
 * fastest sub-window of an over-sized mapping.
 * The chain is assumed to have been pre-linked page-by-page by the
 * caller; only the final link is patched here (to close the loop) and
 * restored before returning. */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {

  BLASULONG original, *p;
  BLASULONG start, stop, min;
  int iter, i, count;

  min = (BLASULONG)-1;

  /* Temporarily redirect the last page's link back to the start. */
  original = *(BLASULONG *)(address + size - PAGESIZE);

  *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address;

  for (iter = 0; iter < BENCH_ITERATION; iter ++ ) {

    p = (BLASULONG *)address;

    count = size / PAGESIZE;

    start = rpcc();

    for (i = 0; i < count; i ++) {
      p = (BLASULONG *)(*p);
    }

    stop = rpcc();

    if (min > stop - start) min = stop - start;
  }

  /* Restore the original link; also store the final chase pointer so the
   * loop above cannot be optimized away as dead code.
   * NOTE(review): the second store uses a hard-coded +8 offset rather
   * than sizeof(BLASULONG) — harmless within the page, but confirm on
   * 32-bit builds. */
  *(BLASULONG *)(address + size - PAGESIZE + 0) = original;
  *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p;

  return min;
}
/* Warm-up variant of alloc_mmap: map SCALING * BUFFER_SIZE, benchmark
 * every page-aligned BUFFER_SIZE window with run_bench(), keep the
 * fastest window and unmap the rest.  A fixed-address request, or the
 * Linux hot_alloc fast path, skips the benchmark entirely. */
static void *alloc_mmap(void *address){
  void *map_address, *best_address;
  BLASULONG best, start, current;
  BLASULONG allocsize;

  if (address){
    /* Just give up use advanced operation */
    map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);

#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

  } else {
#if defined(OS_LINUX) && !defined(NO_WARMUP)
    if (hot_alloc == 0) {
      /* First allocation: plain mapping, no benchmark. */
      map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);

#ifdef OS_LINUX
      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    } else {
#endif

      /* Over-allocate so there is room to slide the benchmark window. */
      map_address = mmap(NULL, BUFFER_SIZE * SCALING,
                         MMAP_ACCESS, MMAP_POLICY, -1, 0);

      if (map_address != (void *)-1) {

#ifdef OS_LINUX
#ifdef DEBUG
        int ret=0;
        ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
        if(ret==-1){
          int errsv=errno;
          perror("OpenBLAS alloc_mmap:");
          printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
        }
#else
        my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
#endif
#endif

        /* Benchmark window size: one GEMM panel of the widest enabled type. */
#ifdef BUILD_DOUBLE
        allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
#elif defined(BUILD_COMPLEX16)
        allocsize = ZGEMM_P * ZGEMM_Q * sizeof(double);
#elif defined(BUILD_COMPLEX)
        allocsize = CGEMM_P * CGEMM_Q * sizeof(double);
#else
        allocsize = SGEMM_P * SGEMM_Q * sizeof(double);
#endif

        /* Pre-link every page to the next so run_bench can pointer-chase. */
        start = (BLASULONG)map_address;
        current = (SCALING - 1) * BUFFER_SIZE;

        while(current > 0) {
          *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
          start += PAGESIZE;
          current -= PAGESIZE;
        }

        *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;

        /* Slide the window one page at a time, keeping the fastest. */
        start = (BLASULONG)map_address;

        best = (BLASULONG)-1;
        best_address = map_address;

        while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {

          current = run_bench(start, allocsize);

          if (best > current) {
            best = current;
            best_address = (void *)start;
          }

          start += PAGESIZE;

        }

        /* Trim the mapping down to the winning BUFFER_SIZE window. */
        if ((BLASULONG)best_address > (BLASULONG)map_address)
          munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address);

        munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);

        map_address = best_address;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
        hot_alloc = 2;
#endif
      }
    }
#if defined(OS_LINUX) && !defined(NO_WARMUP)
  }
#endif

  if (map_address != (void *)-1) {
    /* Register the mapping for blas_shutdown(); shared state, so lock
     * except under OpenMP. */
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    LOCK_COMMAND(&alloc_lock);
#endif
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_mmap_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
    }
    release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    UNLOCK_COMMAND(&alloc_lock);
#endif
  }

  return map_address;
}

#endif

#endif
  1813. #ifdef ALLOC_MALLOC
  1814. static void alloc_malloc_free(struct release_t *release){
  1815. free(release -> address);
  1816. }
  1817. static void *alloc_malloc(void *address){
  1818. void *map_address;
  1819. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  1820. if (map_address == (void *)NULL) map_address = (void *)-1;
  1821. if (map_address != (void *)-1) {
  1822. if (likely(release_pos < NUM_BUFFERS)) {
  1823. release_info[release_pos].address = map_address;
  1824. release_info[release_pos].func = alloc_malloc_free;
  1825. } else {
  1826. new_release_info[release_pos-NUM_BUFFERS].address = map_address;
  1827. new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free;
  1828. }
  1829. release_pos ++;
  1830. }
  1831. return map_address;
  1832. }
  1833. #endif
#ifdef ALLOC_QALLOC

/* External qalloc allocator interface (platform-provided). */
void *qalloc(int flags, size_t bytes);
void *qfree (void *address);

#define QNONCACHE 0x1
#define QCOMMS    0x2
#define QFAST     0x4

/* Release callback for buffers obtained through alloc_qalloc(). */
static void alloc_qalloc_free(struct release_t *release){

  qfree(release -> address);

}

/* Allocate one buffer via qalloc and register it in the release table.
 * The stored address is the raw qalloc pointer; the caller receives the
 * address rounded up to the next FIXED_PAGESIZE boundary. */
static void *alloc_qalloc(void *address){
  void *map_address;

  map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);

  if (map_address == (void *)NULL) map_address = (void *)-1;

  if (map_address != (void *)-1) {
    /* NOTE(review): unlike alloc_mmap, this update of the shared release
     * table is not guarded by alloc_lock — confirm callers serialize. */
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_qalloc_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free;
    }
    release_pos ++;
  }

  /* Round up to a FIXED_PAGESIZE boundary before handing out. */
  return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));

}

#endif
#ifdef ALLOC_WINDOWS

/* Release callback for buffers obtained through alloc_windows(). */
static void alloc_windows_free(struct release_t *release){

  VirtualFree(release -> address, 0, MEM_RELEASE);

}

/* Allocate one buffer via VirtualAlloc and register it in the release
 * table.  Returns the buffer, or (void *)-1 on failure. */
static void *alloc_windows(void *address){
  void *map_address;

  map_address  = VirtualAlloc(address,
                              BUFFER_SIZE,
                              MEM_RESERVE | MEM_COMMIT,
                              PAGE_READWRITE);

  if (map_address == (void *)NULL) map_address = (void *)-1;

  if (map_address != (void *)-1) {
    /* NOTE(review): release-table update is unguarded here (alloc_mmap
     * takes alloc_lock for the same update) — confirm callers serialize. */
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_windows_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free;
    }
    release_pos ++;
  }

  return map_address;

}

#endif
#ifdef ALLOC_DEVICEDRIVER
#ifndef DEVICEDRIVER_NAME
#define DEVICEDRIVER_NAME "/dev/mapper"
#endif

/* Release callback for buffers mapped from the device driver: unmap the
 * region and close the driver file descriptor stored in 'attr'.
 * (The "devicedirver" spelling is historical and kept for consistency
 * with existing callers.) */
static void alloc_devicedirver_free(struct release_t *release){

  if (munmap(release -> address, BUFFER_SIZE)) {
    printf("OpenBLAS : Bugphysarea unmap failed.\n");
  }

  if (close(release -> attr)) {
    printf("OpenBLAS : Bugphysarea close failed.\n");
  }

}

/* Map one buffer of physically contiguous memory from DEVICEDRIVER_NAME
 * and register it in the release table.  Returns the mapping, or
 * (void *)-1 if the driver cannot be opened or mapped. */
static void *alloc_devicedirver(void *address){

  int fd;
  void *map_address;

  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {

    return (void *)-1;

  }

  map_address = mmap(address, BUFFER_SIZE,
                     PROT_READ | PROT_WRITE,
                     MAP_FILE | MAP_SHARED,
                     fd, 0);

  if (map_address != (void *)-1) {
    /* NOTE(review): the fd is kept open for the lifetime of the mapping
     * and closed in the release callback; on mmap failure the fd leaks
     * — confirm whether that is acceptable here. */
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].attr = fd;
      release_info[release_pos].func = alloc_devicedirver_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].attr = fd;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free;
    }
    release_pos ++;
  }

  return map_address;
}

#endif
#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)

/* Release callback for buffers obtained through alloc_shm(): detach the
 * SysV shared memory segment. */
static void alloc_shm_free(struct release_t *release){

  if (shmdt(release -> address)) {
    printf("OpenBLAS : Shared memory unmap failed.\n");
  }
}

/* Allocate one buffer as a private SysV shared-memory segment, attach
 * it, mark the segment for removal (so it disappears when the last
 * attachment goes away), and register it in the release table. */
static void *alloc_shm(void *address){
  void *map_address;
  int shmid;

#ifdef DEBUG
  fprintf(stderr,"alloc_shm got called\n");
#endif
  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);

  map_address = (void *)shmat(shmid, address, 0);

  if (map_address != (void *)-1){

#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    /* IPC_RMID: segment is destroyed automatically on last detach. */
    shmctl(shmid, IPC_RMID, 0);

    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].attr = shmid;
      release_info[release_pos].func = alloc_shm_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].attr = shmid;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free;
    }
    release_pos ++;
  }

  return map_address;
}
#endif
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS

/* Release callback for buffers obtained through alloc_hugetlb().
 * NOTE(review): on __sun__ the buffer comes from memalign() but is
 * released here with munmap() — confirm that pairing is intended. */
static void alloc_hugetlb_free(struct release_t *release){

#if defined(OS_LINUX) || defined(OS_AIX)
  if (shmdt(release -> address)) {
    printf("OpenBLAS : Hugepage unmap failed.\n");
  }
#endif

#ifdef __sun__

  munmap(release -> address, BUFFER_SIZE);

#endif

#ifdef OS_WINDOWS

  VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE);

#endif

}

/* Allocate one buffer backed by huge/large pages using the platform's
 * mechanism: SysV SHM_HUGETLB/SHM_LGPAGE on Linux/AIX, memcntl+memalign
 * on Solaris, and SeLockMemory + MEM_LARGE_PAGES VirtualAlloc on
 * Windows.  Returns (void *)-1 on any failure. */
static void *alloc_hugetlb(void *address){

  void *map_address = (void *)-1;

#ifdef DEBUG
  fprintf(stderr,"alloc_hugetlb got called\n");
#endif

#if defined(OS_LINUX) || defined(OS_AIX)
  int shmid;

  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
#ifdef OS_LINUX
                 SHM_HUGETLB |
#endif
#ifdef OS_AIX
                 SHM_LGPAGE | SHM_PIN |
#endif
                 IPC_CREAT | SHM_R | SHM_W);

  if (shmid != -1) {

    map_address = (void *)shmat(shmid, address, SHM_RND);

#ifdef OS_LINUX
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
#endif

    if (map_address != (void *)-1){
      /* Destroy segment on last detach. */
      shmctl(shmid, IPC_RMID, 0);
    }else printf("alloc_hugetlb failed\n");
  }
#endif

#ifdef __sun__

  struct memcntl_mha mha;

  mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
  mha.mha_flags = 0;
  mha.mha_pagesize = HUGE_PAGESIZE;
  memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);

  /* NOTE(review): memalign's pointer is cast through BLASULONG into a
   * void * here — confirm this round-trip is safe on this platform. */
  map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
#endif

#ifdef OS_WINDOWS
  /* Temporarily enable the SeLockMemory privilege, which large-page
   * VirtualAlloc requires, then drop it again afterwards. */
  HANDLE hToken;
  TOKEN_PRIVILEGES tp;

  if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;

  tp.PrivilegeCount = 1;
  tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;

  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
    CloseHandle(hToken);
    return (void*)-1;
  }

  if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
    CloseHandle(hToken);
    return (void*)-1;
  }

  map_address  = (void *)VirtualAlloc(address,
                                      BUFFER_SIZE,
                                      MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
                                      PAGE_READWRITE);

  tp.Privileges[0].Attributes = 0;
  AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);

  if (map_address == (void *)NULL) map_address = (void *)-1;

#endif

  if (map_address != (void *)-1){
    /* Register for blas_shutdown(). */
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].func = alloc_hugetlb_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free;
    }
    release_pos ++;
  }

  return map_address;
}
#endif
#ifdef ALLOC_HUGETLBFILE

static int hugetlb_pid = 0;

/* Release callback for buffers obtained through alloc_hugetlbfile():
 * unmap the region and close the backing hugetlbfs file descriptor. */
static void alloc_hugetlbfile_free(struct release_t *release){

  if (munmap(release -> address, BUFFER_SIZE)) {
    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
  }

  if (close(release -> attr)) {
    printf("OpenBLAS : HugeTLBfs close failed.\n");
  }

}

/* Allocate one buffer by creating (and immediately unlinking) a file on
 * a hugetlbfs mount and mmap'ing it.  The unlink means the file vanishes
 * once the mapping and fd are gone.  Returns (void *)-1 on failure. */
static void *alloc_hugetlbfile(void *address){

  void *map_address = (void *)-1;
  int fd;
  char filename[64];

  if (!hugetlb_pid) hugetlb_pid = getpid();

  sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);

  if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
    return (void *)-1;
  }

  unlink(filename);

  map_address = mmap(address, BUFFER_SIZE,
                     PROT_READ | PROT_WRITE,
                     MAP_SHARED,
                     fd, 0);

  if (map_address != (void *)-1) {
    if (likely(release_pos < NUM_BUFFERS)) {
      release_info[release_pos].address = map_address;
      release_info[release_pos].attr = fd;
      release_info[release_pos].func = alloc_hugetlbfile_free;
    } else {
      new_release_info[release_pos-NUM_BUFFERS].address = map_address;
      new_release_info[release_pos-NUM_BUFFERS].attr = fd;
      new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free;
    }
    release_pos ++;
  }

  return map_address;
}
#endif
/* Next address hint passed to the allocator back-ends; advanced after
 * each successful allocation, reset to 0 to let the OS choose. */
#ifdef SEEK_ADDRESS
static BLASULONG base_address      = 0UL;
#else
static BLASULONG base_address      = BASE_ADDRESS;
#endif

/* Per-slot buffer bookkeeping.  'dummy' pads each entry (to a cache
 * line) so concurrent updates of adjacent slots do not false-share. */
static volatile struct {
  BLASULONG lock;
  void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
  int   pos;
#endif
  int used;
#ifndef __64BIT__
  char dummy[48];
#else
  char dummy[40];
#endif

} memory[NUM_BUFFERS];

/* Same layout as the anonymous struct above; used for the dynamically
 * allocated overflow table when NUM_BUFFERS slots are exhausted. */
struct newmemstruct
{
  BLASULONG lock;
  void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
  int   pos;
#endif
  int used;
#ifndef __64BIT__
  char dummy[48];
#else
  char dummy[40];
#endif

};
static volatile struct newmemstruct *newmemory;

/* One-time init flag (1 = basic init done, 2 = dynamic-arch re-check
 * done) and whether the overflow table has been brought into use. */
static volatile int memory_initialized = 0;
static int memory_overflowed = 0;
/* Memory allocation routine */
/* procpos ... indicates where it comes from */
/*                0 : Level 3 functions */
/*                1 : Level 2 functions */
/*                2 : Thread */
/* Hand out one BUFFER_SIZE scratch buffer to a BLAS kernel.
 * Performs one-time library initialization, claims a free slot in the
 * fixed memory[] table (or the malloc'ed overflow table), and lazily
 * maps a buffer for that slot via the first allocator back-end that
 * succeeds.  Returns the buffer address, or NULL when every slot in
 * both tables is in use.  'procpos' indicates the caller class (see
 * comment above) but is currently unused. */
void *blas_memory_alloc(int procpos){

  int i;
  int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
  int mypos = 0;
#endif

  void *map_address;

  /* Allocator back-ends, tried in order of preference until one works. */
  void *(*memoryalloc[])(void *address) = {
#ifdef ALLOC_DEVICEDRIVER
    alloc_devicedirver,
#endif
#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
    alloc_shm,
#endif
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
    alloc_hugetlb,
#endif
#ifdef ALLOC_MMAP
    alloc_mmap,
#endif
#ifdef ALLOC_QALLOC
    alloc_qalloc,
#endif
#ifdef ALLOC_WINDOWS
    alloc_windows,
#endif
#ifdef ALLOC_MALLOC
    alloc_malloc,
#endif
    NULL,
  };
  void *(**func)(void *address);

  /* One-time library initialization, double-checked under alloc_lock. */
  if (!memory_initialized) {

#if defined(SMP) && !defined(USE_OPENMP)
    LOCK_COMMAND(&alloc_lock);
    if (!memory_initialized) {
#endif

#if defined(WHEREAMI) && !defined(USE_OPENMP)
      for (position = 0; position < NUM_BUFFERS; position ++){
        memory[position].addr   = (void *)0;
        memory[position].pos    = -1;
        memory[position].used   = 0;
        memory[position].lock   = 0;
      }
#endif

#ifdef DYNAMIC_ARCH
      gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
      gotoblas_affinity_init();
#endif

#ifdef SMP
      if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif

#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64)
#ifndef DYNAMIC_ARCH
      blas_set_parameter();
#endif
#endif

      memory_initialized = 1;
      WMB;
#if defined(SMP) && !defined(USE_OPENMP)
    }
    UNLOCK_COMMAND(&alloc_lock);
#endif
  }

#ifdef DEBUG
  printf("Alloc Start ...\n");
#endif

/* #if defined(WHEREAMI) && !defined(USE_OPENMP)

  mypos = WhereAmI();

  position = mypos;
  while (position >= NUM_BUFFERS) position >>= 1;

  do {
    if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
      LOCK_COMMAND(&alloc_lock);
#else
      blas_lock(&memory[position].lock);
#endif
      if (!memory[position].used) goto allocation;

#if defined(SMP) && !defined(USE_OPENMP)
      UNLOCK_COMMAND(&alloc_lock);
#else
      blas_unlock(&memory[position].lock);
#endif
    }

    position ++;

  } while (position < NUM_BUFFERS);

#endif */

  /* Scan the fixed table for a free slot.  Non-OpenMP builds hold the
   * global alloc_lock for the whole scan; OpenMP builds take the
   * per-slot spinlock instead. */
  position = 0;

#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  LOCK_COMMAND(&alloc_lock);
#endif
  do {
    RMB;
#if defined(USE_OPENMP)
    if (!memory[position].used) {
      blas_lock(&memory[position].lock);
#endif
      if (!memory[position].used) goto allocation;

#if defined(USE_OPENMP)
      blas_unlock(&memory[position].lock);
    }
#endif

    position ++;

  } while (position < NUM_BUFFERS);

  /* Fixed table exhausted — scan the overflow table if it exists. */
  if (memory_overflowed) {
    do {
      RMB;
#if defined(USE_OPENMP)
      if (!newmemory[position-NUM_BUFFERS].used) {
        blas_lock(&newmemory[position-NUM_BUFFERS].lock);
#endif
        if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;

#if defined(USE_OPENMP)
        blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
      }
#endif

      position ++;

    } while (position < NEW_BUFFERS + NUM_BUFFERS);
  }
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#endif

  goto error;

  /* A free slot in the fixed table was found (lock still held). */
  allocation :

#ifdef DEBUG
  printf("  Position -> %d\n", position);
#endif

  memory[position].used = 1;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#else
  blas_unlock(&memory[position].lock);
#endif

  /* First use of this slot: obtain a buffer from the back-ends.  Note
   * this runs with no lock held; the slot is already marked used. */
  if (!memory[position].addr) {
    do {
#ifdef DEBUG
      printf("Allocation Start : %lx\n", base_address);
#endif

      map_address = (void *)-1;

      func = &memoryalloc[0];

      while ((*func != NULL) && (map_address == (void *) -1)) {

        map_address = (*func)((void *)base_address);

#ifdef ALLOC_DEVICEDRIVER
        if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
          fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
        }
#endif

#ifdef ALLOC_HUGETLBFILE
        if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
          fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
#endif
        }
#endif

#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#ifdef DEBUG
        if (hugetlb_allocated) printf("allocating via shared memory with large page support (hugetlb)\n");
#endif
#endif

#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
#ifdef DEBUG
        printf("allocating via shared memory\n");
#endif
        if ((*func == alloc_shm) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
          fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n");
#endif
        }
#endif

        func ++;
      }

#ifdef DEBUG
      printf("  Success -> %08lx\n", map_address);
#endif

      /* All back-ends failed at this hint: retry once with no hint. */
      if (((BLASLONG) map_address) == -1) base_address = 0UL;

      if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;

    } while ((BLASLONG)map_address == -1);

#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    LOCK_COMMAND(&alloc_lock);
#endif
    memory[position].addr = map_address;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    UNLOCK_COMMAND(&alloc_lock);
#endif

#ifdef DEBUG
    printf("  Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
#endif
  }

#if defined(WHEREAMI) && !defined(USE_OPENMP)

  if (memory[position].pos == -1) memory[position].pos = mypos;

#endif

#ifdef DYNAMIC_ARCH

  /* Second-phase init for dynamic arch (double-checked under the lock). */
  if (memory_initialized == 1) {

    LOCK_COMMAND(&alloc_lock);

    if (memory_initialized == 1) {

      if (!gotoblas) gotoblas_dynamic_init();

      memory_initialized = 2;
    }

    UNLOCK_COMMAND(&alloc_lock);

  }

#endif


#ifdef DEBUG
  printf("Mapped   : %p  %3d\n\n",
          (void *)memory[position].addr, position);
#endif

  return (void *)memory[position].addr;

  /* Both tables full.  First time through: create the overflow tables,
   * then fall through to allocation2 with position == NUM_BUFFERS.
   * (The alloc_lock is re-taken here in non-OpenMP builds.) */
 error:

#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  LOCK_COMMAND(&alloc_lock);
#endif
  if (memory_overflowed) goto terminate;
  fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
  fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
  fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER);
  memory_overflowed=1;
  MB;
  new_release_info = (struct release_t*) malloc(NEW_BUFFERS * sizeof(struct release_t));
  newmemory = (struct newmemstruct*) malloc(NEW_BUFFERS * sizeof(struct newmemstruct));
  for (i = 0; i < NEW_BUFFERS; i++) {
    newmemory[i].addr   = (void *)0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
    newmemory[i].pos    = -1;
#endif
    newmemory[i].used   = 0;
    newmemory[i].lock   = 0;
  }

  /* A free slot in the overflow table was found (or just created). */
allocation2:
  newmemory[position-NUM_BUFFERS].used = 1;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#else
  blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
#endif

  /* Same back-end retry loop as for the fixed table above. */
  do {
#ifdef DEBUG
    printf("Allocation Start : %lx\n", base_address);
#endif

    map_address = (void *)-1;

    func = &memoryalloc[0];

    while ((*func != NULL) && (map_address == (void *) -1)) {

      map_address = (*func)((void *)base_address);

#ifdef ALLOC_DEVICEDRIVER
      if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
        fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
      }
#endif

#ifdef ALLOC_HUGETLBFILE
      if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
        fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
#endif
      }
#endif

#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
#ifdef DEBUG
      fprintf(stderr,"OpenBLAS: allocating via shared memory with large page support (hugetlb)\n");
#endif
      if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif

#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
#ifdef DEBUG
      fprintf(stderr,"allocating via shared memory\n");
#endif
      if ((*func == alloc_shm) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
        fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n");
#endif
      }
#endif

      func ++;
    }

#ifdef DEBUG
    printf("  Success -> %08lx\n", map_address);
#endif

    if (((BLASLONG) map_address) == -1) base_address = 0UL;

    if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;

  } while ((BLASLONG)map_address == -1);

#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  LOCK_COMMAND(&alloc_lock);
#endif
  newmemory[position-NUM_BUFFERS].addr = map_address;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#endif

#ifdef DEBUG
  printf("  Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position);
#endif

#if defined(WHEREAMI) && !defined(USE_OPENMP)

  if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos;

#endif

  return (void *)newmemory[position-NUM_BUFFERS].addr;

  /* Overflow table also full — no more buffers can ever be handed out. */
terminate:
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#endif
  printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
  printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
  printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
  printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
  printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
  printf("cpu cores than what OpenBLAS was configured to handle.\n");

  return NULL;
}
/* Return a buffer obtained from blas_memory_alloc() to the free pool.
 * The buffer itself stays mapped (memory[pos].addr is kept for reuse);
 * only the 'used' flag is cleared.  Looks the address up first in the
 * fixed table, then in the overflow table.  An address that matches
 * neither is reported and ignored. */
void blas_memory_free(void *free_area){

  int position;

#ifdef DEBUG
  printf("Unmapped Start : %p ...\n", free_area);
#endif

  position = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  LOCK_COMMAND(&alloc_lock);
#endif
  while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
    position++;

  if (position >= NUM_BUFFERS && !memory_overflowed) goto error;

#ifdef DEBUG
  /* NOTE(review): when position == NUM_BUFFERS this DEBUG check indexes
   * one past the fixed table — confirm, it only runs in DEBUG builds. */
  if (memory[position].addr != free_area) goto error;
  printf("  Position : %d\n", position);
#endif

  if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
    /* Continue the search in the overflow table. */
    while ((position < NUM_BUFFERS+NEW_BUFFERS) && (newmemory[position-NUM_BUFFERS].addr != free_area))
      position++;
    // arm: ensure all writes are finished before other thread takes this memory
    WMB;

    if (position - NUM_BUFFERS >= NEW_BUFFERS) goto error;
    newmemory[position-NUM_BUFFERS].used = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    UNLOCK_COMMAND(&alloc_lock);
#endif

#ifdef DEBUG
    printf("Unmap from overflow area succeeded.\n\n");
#endif
    return;
  } else {
    // arm: ensure all writes are finished before other thread takes this memory
    WMB;

    memory[position].used = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
    UNLOCK_COMMAND(&alloc_lock);
#endif

#ifdef DEBUG
    printf("Unmap Succeeded.\n\n");
#endif
    return;
  }

 error:
  printf("BLAS : Bad memory unallocation! : %4d  %p\n", position, free_area);

#ifdef DEBUG
  for (position = 0; position < NUM_BUFFERS; position++)
    printf("%4ld  %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#endif
  return;
}
  2475. void *blas_memory_alloc_nolock(int unused) {
  2476. void *map_address;
  2477. map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
  2478. return map_address;
  2479. }
/* Release a buffer obtained from blas_memory_alloc_nolock().
 * Plain free(); no buffer-table bookkeeping and no locking. */
void blas_memory_free_nolock(void * map_address) {
  free(map_address);
}
  2483. void blas_shutdown(void){
  2484. int pos;
  2485. #ifdef SMP
  2486. BLASFUNC(blas_thread_shutdown)();
  2487. #endif
  2488. LOCK_COMMAND(&alloc_lock);
  2489. for (pos = 0; pos < release_pos; pos ++) {
  2490. if (likely(pos < NUM_BUFFERS))
  2491. release_info[pos].func(&release_info[pos]);
  2492. else
  2493. new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]);
  2494. }
  2495. #ifdef SEEK_ADDRESS
  2496. base_address = 0UL;
  2497. #else
  2498. base_address = BASE_ADDRESS;
  2499. #endif
  2500. for (pos = 0; pos < NUM_BUFFERS; pos ++){
  2501. memory[pos].addr = (void *)0;
  2502. memory[pos].used = 0;
  2503. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2504. memory[pos].pos = -1;
  2505. #endif
  2506. memory[pos].lock = 0;
  2507. }
  2508. if (memory_overflowed) {
  2509. for (pos = 0; pos < NEW_BUFFERS; pos ++){
  2510. newmemory[pos].addr = (void *)0;
  2511. newmemory[pos].used = 0;
  2512. #if defined(WHEREAMI) && !defined(USE_OPENMP)
  2513. newmemory[pos].pos = -1;
  2514. #endif
  2515. newmemory[pos].lock = 0;
  2516. }
  2517. free(newmemory);
  2518. newmemory = NULL;
  2519. memory_overflowed = 0;
  2520. }
  2521. UNLOCK_COMMAND(&alloc_lock);
  2522. return;
  2523. }
#if defined(OS_LINUX) && !defined(NO_WARMUP)

/* Lock serializing the page-touch warm-up below across threads.
   The concrete lock type matches the build's locking primitive. */
#if defined(SMP) || defined(USE_LOCKING)
#if   defined(USE_PTHREAD_LOCK)
static pthread_mutex_t    init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t init_lock = 0;
#else
static BLASULONG   init_lock = 0UL;
#endif
#endif
  2534. static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
  2535. void *sa, void *sb, BLASLONG pos) {
  2536. #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
  2537. size_t size;
  2538. BLASULONG buffer;
  2539. size = BUFFER_SIZE - PAGESIZE;
  2540. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  2541. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  2542. if (hot_alloc != 2) {
  2543. #endif
  2544. #if defined(SMP) || defined(USE_LOCKING)
  2545. LOCK_COMMAND(&init_lock);
  2546. #endif
  2547. while (size > 0) {
  2548. *(int *)buffer = size;
  2549. buffer += PAGESIZE;
  2550. size -= PAGESIZE;
  2551. }
  2552. #if defined(SMP) || defined(USE_LOCKING)
  2553. UNLOCK_COMMAND(&init_lock);
  2554. #endif
  2555. size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
  2556. buffer = (BLASULONG)sa + GEMM_OFFSET_A;
  2557. while (size > 0) {
  2558. *(int *)buffer = size;
  2559. buffer += 64;
  2560. size -= 64;
  2561. }
  2562. #if defined(OS_LINUX) && !defined(NO_WARMUP)
  2563. }
  2564. #endif
  2565. #endif
  2566. }
  2567. #ifdef SMP
  2568. static void _init_thread_memory(void *buffer) {
  2569. blas_queue_t queue[MAX_CPU_NUMBER];
  2570. int num_cpu;
  2571. for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) {
  2572. blas_queue_init(&queue[num_cpu]);
  2573. queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL;
  2574. queue[num_cpu].routine = &_touch_memory;
  2575. queue[num_cpu].args = NULL;
  2576. queue[num_cpu].next = &queue[num_cpu + 1];
  2577. }
  2578. queue[num_cpu - 1].next = NULL;
  2579. queue[0].sa = buffer;
  2580. exec_blas(num_cpu, queue);
  2581. }
  2582. #endif
  2583. static void gotoblas_memory_init(void) {
  2584. void *buffer;
  2585. hot_alloc = 1;
  2586. buffer = (void *)blas_memory_alloc(0);
  2587. #ifdef SMP
  2588. if (blas_cpu_number == 0) blas_get_cpu_number();
  2589. #ifdef SMP_SERVER
  2590. if (blas_server_avail == 0) blas_thread_init();
  2591. #endif
  2592. _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
  2593. #else
  2594. _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
  2595. #endif
  2596. blas_memory_free(buffer);
  2597. }
  2598. #endif
/* Library-wide initialization; gotoblas_init below runs as a constructor
   before main(). */
static int gotoblas_initialized = 0;  /* guards against double init/quit */
extern void openblas_read_env(void);
/* Library constructor: initialize every subsystem in dependency order
 * (fork handler, env, dynamic arch, affinity, memory warm-up, thread
 * pool, profiling).  Idempotent via the gotoblas_initialized flag. */
void CONSTRUCTOR gotoblas_init(void) {

  if (gotoblas_initialized) return;

#ifdef SMP
  /* Install pthread_atfork handlers before any threads exist. */
  openblas_fork_handler();
#endif

  openblas_read_env();

#ifdef PROFILE
  /* Suspend profiling during initialization. */
  moncontrol (0);
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_init();
#endif

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  gotoblas_memory_init();
#endif

//#if defined(OS_LINUX)
/* Disabled: raising RLIMIT_STACK to its hard maximum. */
#if 0
  struct rlimit curlimit;
  if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
  {
    if ( curlimit.rlim_cur != curlimit.rlim_max )
    {
      curlimit.rlim_cur = curlimit.rlim_max;
      setrlimit(RLIMIT_STACK, &curlimit);
    }
  }
#endif

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
  if (blas_server_avail == 0) blas_thread_init();
#endif
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_init();
#endif

  gotoblas_initialized = 1;

#ifdef PROFILE
  /* Re-enable profiling now that setup is complete. */
  moncontrol (1);
#endif
}
/* Library destructor: tear subsystems down in roughly the reverse order
 * of gotoblas_init.  No-op if the library was never initialized. */
void DESTRUCTOR gotoblas_quit(void) {

  if (gotoblas_initialized == 0) return;

  blas_shutdown();

#ifdef PROFILE
  /* Suspend profiling during teardown. */
  moncontrol (0);
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_quit();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_quit();
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_quit();
#endif

  gotoblas_initialized = 0;

#ifdef PROFILE
  moncontrol (1);
#endif
}
  2666. #if defined(_MSC_VER) && !defined(__clang__)
  2667. BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
  2668. {
  2669. switch (ul_reason_for_call)
  2670. {
  2671. case DLL_PROCESS_ATTACH:
  2672. gotoblas_init();
  2673. break;
  2674. case DLL_THREAD_ATTACH:
  2675. break;
  2676. case DLL_THREAD_DETACH:
  2677. break;
  2678. case DLL_PROCESS_DETACH:
  2679. gotoblas_quit();
  2680. break;
  2681. default:
  2682. break;
  2683. }
  2684. return TRUE;
  2685. }
  2686. /*
  2687. This is to allow static linking.
  2688. Code adapted from Google performance tools:
  2689. https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  2690. Reference:
  2691. https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  2692. http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
  2693. */
/* Termination hook planted in the CRT's .CRT$XTU table below so that
 * statically linked builds (which never see DLL_PROCESS_DETACH) still
 * run gotoblas_quit() at process exit. */
static int on_process_term(void)
{
	gotoblas_quit();
	return 0;
}
/* Force the linker to keep the TLS directory so the CRT walks the
   callback tables below even in static builds.  The symbol name has one
   leading underscore on x64 and two on x86. */
#ifdef _WIN64
#pragma comment(linker, "/INCLUDE:_tls_used")
#else
#pragma comment(linker, "/INCLUDE:__tls_used")
#endif

/* Place a pointer to DllMain in the .CRT$XLB section (TLS callbacks);
   the section must be const on x64 and data on x86. */
#ifdef _WIN64
#pragma const_seg(".CRT$XLB")
#else
#pragma data_seg(".CRT$XLB")
#endif
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif

/* Place on_process_term in the .CRT$XTU section so the CRT invokes it
   during process termination. */
#ifdef _WIN64
#pragma const_seg(".CRT$XTU")
#else
#pragma data_seg(".CRT$XTU")
#endif
static int(*p_process_term)(void) = on_process_term;
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
#endif
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Don't call me; this is just work around for PGI / Sun bug */
/* PGI (< 19) and Sun compilers do not honor constructor/destructor
 * attributes, so this never-called function emits assembly that hooks
 * gotoblas_init/quit into the .init/.fini sections directly. */
void gotoblas_dummy_for_PGI(void) {

  gotoblas_init();
  gotoblas_quit();
#if __PGIC__ < 19
#if 0
  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
#else
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif
  2743. #endif