|
- /*****************************************************************************
- Copyright (c) 2011-2014, The OpenBLAS Project
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
-
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written
- permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- **********************************************************************************/
-
-
- /*********************************************************************/
- /* Copyright 2009, 2010 The University of Texas at Austin. */
- /* All rights reserved. */
- /* */
- /* Redistribution and use in source and binary forms, with or */
- /* without modification, are permitted provided that the following */
- /* conditions are met: */
- /* */
- /* 1. Redistributions of source code must retain the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer. */
- /* */
- /* 2. Redistributions in binary form must reproduce the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer in the documentation and/or other materials */
- /* provided with the distribution. */
- /* */
- /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
- /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
- /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
- /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
- /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
- /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
- /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
- /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
- /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
- /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
- /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
- /* POSSIBILITY OF SUCH DAMAGE. */
- /* */
- /* The views and conclusions contained in the software and */
- /* documentation are those of the authors and should not be */
- /* interpreted as representing official policies, either expressed */
- /* or implied, of The University of Texas at Austin. */
- /*********************************************************************/
-
- #include "common.h"
-
- #if defined(OS_LINUX) && defined(SMP)
-
- #define _GNU_SOURCE
-
- #include <sys/sysinfo.h>
- #include <sys/syscall.h>
- #include <sys/types.h>
- #include <errno.h>
- #include <sys/shm.h>
- #include <fcntl.h>
- #include <sched.h>
- #include <dirent.h>
- #include <dlfcn.h>
- #include <unistd.h>
- #include <string.h>
-
/*
 * Compile-time capacity limits.
 *
 * BIGNUMA builds use the node limit defined in numa.h and the CPU limit
 * defined in most sched.h headers.  CPU_SETSIZE is not used directly
 * because some Linux distributors set it to 4096.
 */
#ifdef BIGNUMA
#define MAX_NODES	128
#define MAX_CPUS	1024
#else
#define MAX_NODES	16
#define MAX_CPUS	256
#endif

/* CPU sets are kept as arrays of unsigned long, one bit per CPU. */
#define NCPUBITS	(sizeof(unsigned long) * 8)
#define MAX_BITMASK_LEN	(MAX_CPUS / NCPUBITS)

/* Index of the array word holding `cpu`, and the bit for `cpu` in that word. */
#define CPUELT(cpu)	((cpu) / NCPUBITS)
#define CPUMASK(cpu)	(1UL << ((cpu) % NCPUBITS))

/* Key of the shared memory segment; also stored in it once it is set up. */
#define SH_MAGIC	0x510510

/* sysfs locations read by this file (printf-style templates). */
#define CPUMAP_NAME	"/sys/devices/system/node/node%d/cpumap"
#define SHARE_NAME	"/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map"
#define NODE_DIR	"/sys/devices/system/node"
-
- //#undef DEBUG
-
- /* Private variables */
- typedef struct {
- unsigned long lock;
- unsigned int magic;
- unsigned int shmid;
-
- int num_nodes;
- int num_procs;
- int final_num_procs;
- unsigned long avail [MAX_BITMASK_LEN];
- int avail_count;
- unsigned long cpu_info [MAX_CPUS];
- unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN];
- int cpu_use[MAX_CPUS];
-
- } shm_t;
-
- static cpu_set_t cpu_orig_mask[4];
-
- static int cpu_mapping[MAX_CPUS];
- static int node_mapping[MAX_CPUS * 4];
- static int cpu_sub_mapping[MAX_CPUS];
- static int disable_mapping;
-
- /* Number of cores per nodes */
- static int node_cpu[MAX_NODES];
- static int node_equal = 0;
-
- static shm_t *common = (void *)-1;
- static int shmid, pshmid;
- static void *paddr;
-
- static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask;
- static int lprocmask_count = 0;
- static int numprocs = 1;
- static int numnodes = 1;
-
/*
 * Packing of one cpu_info[] record: three 8-bit fields.  The CPU number
 * always sits in the low byte; the `#if` selects which of the next two
 * bytes carries the node and which the core.
 */
#if 1
#define CPUINFO_NODE_SHIFT	8
#define CPUINFO_CORE_SHIFT	16
#else
#define CPUINFO_NODE_SHIFT	16
#define CPUINFO_CORE_SHIFT	8
#endif

/* Field extraction. */
#define READ_CPU(x)	((x) & 0xff)
#define READ_NODE(x)	(((x) >> CPUINFO_NODE_SHIFT) & 0xff)
#define READ_CORE(x)	(((x) >> CPUINFO_CORE_SHIFT) & 0xff)

/* Field placement; OR the three results together to build a record. */
#define WRITE_CPU(x)	(x)
#define WRITE_NODE(x)	((x) << CPUINFO_NODE_SHIFT)
#define WRITE_CORE(x)	((x) << CPUINFO_CORE_SHIFT)
-
/*
 * Number of bits set in `number`.
 */
static inline int popcount(unsigned long number) {

  int bits = 0;

  for (; number != 0; number >>= 1) {
    bits += (int)(number & 1UL);
  }

  return bits;
}
-
/*
 * Zero-based position of the highest set bit of `number`, or -1 when
 * `number` is 0.
 *
 * The loop used to carry an extra `(number & 0) == 0` term.  That term is
 * always true, so it never affected the result, but it read like a typo
 * for `& 1`; it has been dropped.  Callers add 1 to the result to get the
 * number of bit positions needed to cover the mask.
 */
static inline int rcount(unsigned long number) {

  int index = -1;

  while (number != 0) {
    index ++;
    number >>= 1;
  }

  return index;
}
-
- /***
- Known issue: The number of CPUs/cores should less
- than sizeof(unsigned long). On 64 bits, the limit
- is 64. On 32 bits, it is 32.
- ***/
- static inline void get_cpumap(int node, unsigned long * node_info) {
-
- int infile;
- unsigned long affinity[32];
- char name[160];
- char cpumap[160];
- char *dummy;
- int i=0;
- int count=0;
- int k=0;
-
- sprintf(name, CPUMAP_NAME, node);
-
- infile = open(name, O_RDONLY);
- for(i=0; i<32; i++){
- affinity[i] = 0;
- }
-
- if (infile != -1) {
-
- read(infile, cpumap, sizeof(cpumap));
-
- for(i=0; i<160; i++){
- if(cpumap[i] == '\n')
- break;
- if(cpumap[i] != ','){
- name[k++]=cpumap[i];
-
- //Enough data for Hex
- if(k >= NCPUBITS/4){
- affinity[count++] = strtoul(name, &dummy, 16);
- k=0;
- }
- }
-
- }
- if(k!=0){
- name[k]='\0';
- affinity[count++] = strtoul(name, &dummy, 16);
- // k=0;
- }
- // 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
- // revert the sequence
- for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
- node_info[i]=affinity[count-i-1];
- }
- close(infile);
- }
-
- return ;
- }
-
- static inline void get_share(int cpu, int level, unsigned long * share) {
-
- int infile;
- unsigned long affinity[32];
- char cpumap[160];
- char name[160];
- char *dummy;
- int count=0;
- int i=0,k=0;
- int bitmask_idx = 0;
-
- sprintf(name, SHARE_NAME, cpu, level);
-
- infile = open(name, O_RDONLY);
-
- // Init share
- for(i=0; i<MAX_BITMASK_LEN; i++){
- share[i]=0;
- }
- bitmask_idx = CPUELT(cpu);
- share[bitmask_idx] = CPUMASK(cpu);
-
- if (infile != -1) {
-
- read(infile, cpumap, sizeof(cpumap));
-
- for(i=0; i<160; i++){
- if(cpumap[i] == '\n')
- break;
- if(cpumap[i] != ','){
- name[k++]=cpumap[i];
-
- //Enough data
- if(k >= NCPUBITS/4){
- affinity[count++] = strtoul(name, &dummy, 16);
- k=0;
- }
- }
-
- }
- if(k!=0){
- name[k]='\0';
- affinity[count++] = strtoul(name, &dummy, 16);
- // k=0;
- }
- // 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
- // revert the sequence
- for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
- share[i]=affinity[count-i-1];
- }
-
-
- close(infile);
- }
-
- return ;
- }
-
- static int numa_check(void) {
-
- DIR *dp;
- struct dirent *dir;
- int node;
- int j;
-
- common -> num_nodes = 0;
-
- dp = opendir(NODE_DIR);
-
- if (dp == NULL) {
- common -> num_nodes = 1;
- return 0;
- }
-
- for (node = 0; node < MAX_NODES; node ++) {
- for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0;
- }
-
- while ((dir = readdir(dp)) != NULL) {
- if (strncmp(dir->d_name, "node", 4)==0) {
-
- node = atoi(&dir -> d_name[4]);
-
- if (node > MAX_NODES) {
- fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
- exit(1);
- }
-
- common -> num_nodes ++;
- get_cpumap(node, common->node_info[node]);
-
- }
- }
-
- closedir(dp);
-
- if (common -> num_nodes == 1) return 1;
-
- #ifdef DEBUG
- fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
-
- for (node = 0; node < common -> num_nodes; node ++)
- fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]);
- #endif
-
- return common -> num_nodes;
- }
-
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 6)
/*
 * Replacement for sched_getcpu(), compiled only for glibc older than 2.6.
 * Skips the leading fields of /proc/self/stat and returns the integer that
 * follows them (presumably the "processor" field -- confirm against
 * proc(5)), or -1 when the file cannot be opened or parsed.
 */
int sched_getcpu(void)
{
  int cpu;
  int fields;
  FILE *fp = fopen("/proc/self/stat", "r");

  if (fp == NULL) return -1;

  fields = fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu);

  fclose (fp);

  return (fields == 1) ? cpu : -1;
}
#endif
#endif
-
- static void numa_mapping(void) {
-
- int node, cpu, core;
- int i, j, h;
- unsigned long work, bit;
- int count = 0;
- int bitmask_idx = 0;
- int current_cpu;
- int current_node = 0;
- int cpu_count = 0;
-
- for (node = 0; node < common -> num_nodes; node ++) {
- core = 0;
- for (cpu = 0; cpu < common -> num_procs; cpu ++) {
- bitmask_idx = CPUELT(cpu);
- /*
- * When common->avail[i] = 0x5555555555555555UL (indicating that adjacent logical cores share a physical core),
- * using it as a mask may overlap with the local_cpu_map function's role, leading to only half of the real physical cores being detected.
- */
- #ifdef ARCH_LOONGARCH64
- if (common -> node_info[node][bitmask_idx]) {
- #else
- if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) {
- #endif
- common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
- count ++;
- core ++;
- }
-
- }
- }
-
- #ifdef DEBUG
- fprintf(stderr, "\nFrom /sys ...\n\n");
-
- for (cpu = 0; cpu < count; cpu++)
- fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
- #endif
-
- current_cpu = sched_getcpu();
- for (cpu = 0; cpu < count; cpu++) {
- if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
- current_node = READ_NODE(common -> cpu_info[cpu]);
- break;
- }
- }
- for (i = 0; i < MAX_BITMASK_LEN; i++)
- cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
-
- /*
- * If all the processes can be accommodated in the
- * in the current node itself, then bind to cores
- * from the current node only
- */
- if (numprocs <= cpu_count) {
- /*
- * First sort all the cores in order from the current node.
- * Then take remaining nodes one by one in order,
- * and sort their cores in order.
- */
- for (i = 0; i < count; i++) {
- for (j = 0; j < count - 1; j++) {
- int node_1, node_2;
- int core_1, core_2;
- int swap = 0;
-
- node_1 = READ_NODE(common -> cpu_info[j]);
- node_2 = READ_NODE(common -> cpu_info[j + 1]);
- core_1 = READ_CORE(common -> cpu_info[j]);
- core_2 = READ_CORE(common -> cpu_info[j + 1]);
-
- if (node_1 == node_2) {
- if (core_1 > core_2)
- swap = 1;
- } else {
- if ((node_2 == current_node) ||
- ((node_1 != current_node) && (node_1 > node_2)))
- swap = 1;
- }
- if (swap) {
- unsigned long temp;
-
- temp = common->cpu_info[j];
- common->cpu_info[j] = common->cpu_info[j + 1];
- common->cpu_info[j + 1] = temp;
- }
- }
- }
- } else {
- h = 1;
-
- while (h < count) h = 2 * h + 1;
-
- while (h > 1) {
- h /= 2;
- for (i = h; i < count; i++) {
- work = common -> cpu_info[i];
- bit = CPU_ISSET(i, &cpu_orig_mask[0]);
- j = i - h;
- while (work < common -> cpu_info[j]) {
- common -> cpu_info[j + h] = common -> cpu_info[j];
- if (CPU_ISSET(j, &cpu_orig_mask[0])) {
- CPU_SET(j + h, &cpu_orig_mask[0]);
- } else {
- CPU_CLR(j + h, &cpu_orig_mask[0]);
- }
- j -= h;
- if (j < 0) break;
- }
- common -> cpu_info[j + h] = work;
- if (bit) {
- CPU_SET(j + h, &cpu_orig_mask[0]);
- } else {
- CPU_CLR(j + h, &cpu_orig_mask[0]);
- }
-
- }
- }
- }
-
- #ifdef DEBUG
- fprintf(stderr, "\nSorting ...\n\n");
-
- for (cpu = 0; cpu < count; cpu++)
- fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
- READ_CPU(common -> cpu_info[cpu]),
- READ_CORE(common -> cpu_info[cpu]),
- READ_NODE(common -> cpu_info[cpu]));
- #endif
-
- }
-
- static void disable_hyperthread(void) {
-
- unsigned long share[MAX_BITMASK_LEN];
- int cpu;
- int bitmask_idx = 0;
- int i=0, count=0;
- bitmask_idx = CPUELT(common -> num_procs);
-
- for(i=0; i< bitmask_idx; i++){
- common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL;
- }
- if(CPUMASK(common -> num_procs) != 1){
- common -> avail[count++] = CPUMASK(common -> num_procs) - 1;
- }
- common -> avail_count = count;
-
- /* if(common->num_procs > 64){ */
- /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */
- /* exit(1); */
- /* }else if(common->num_procs == 64){ */
- /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */
- /* }else */
- /* common -> avail = (1UL << common -> num_procs) - 1; */
-
- #ifdef DEBUG
- fprintf(stderr, "\nAvail CPUs : ");
- for(i=0; i<count; i++)
- fprintf(stderr, "%04lx ", common -> avail[i]);
- fprintf(stderr, ".\n");
- #endif
-
- for (cpu = 0; cpu < common -> num_procs; cpu ++) {
-
- get_share(cpu, 1, share);
-
- //When the shared cpu are in different element of share & avail array, this may be a bug.
- for (i = 0; i < count ; i++){
-
- share[i] &= common->avail[i];
-
- if (popcount(share[i]) > 1) {
-
- #ifdef DEBUG
- fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
- cpu, share[i] & ~(CPUMASK(cpu)));
- #endif
-
- common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
- }
- }
- }
- }
-
- static void disable_affinity(void) {
- int i=0;
- int bitmask_idx=0;
- int count=0;
- #ifdef DEBUG
- fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]);
- fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
- #endif
-
- /* if(common->final_num_procs > 64){ */
- /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */
- /* exit(1); */
- /* }else if(common->final_num_procs == 64){ */
- /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */
- /* }else */
- /* lprocmask = (1UL << common -> final_num_procs) - 1; */
-
- bitmask_idx = CPUELT(common -> final_num_procs);
-
- for(i=0; i< bitmask_idx; i++){
- lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL;
- }
- if(CPUMASK(common -> final_num_procs) != 1){
- lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1;
- }
- lprocmask_count = count;
-
- #ifndef USE_OPENMP
- for(i=0; i< count; i++){
- lprocmask[i] &= common->avail[i];
- }
- #endif
-
- #ifdef DEBUG
- fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]);
- #endif
-
- }
-
- static void setup_mempolicy(void) {
-
- int cpu, mynode, maxcpu;
-
- for (cpu = 0; cpu < MAX_NODES; cpu ++) node_cpu[cpu] = 0;
-
- maxcpu = 0;
-
- for (cpu = 0; cpu < numprocs; cpu ++) {
- mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[cpu]]);
-
- lnodemask |= (1UL << mynode);
-
- node_cpu[mynode] ++;
-
- if (maxcpu < node_cpu[mynode]) maxcpu = node_cpu[mynode];
- }
-
- node_equal = 1;
-
- for (cpu = 0; cpu < MAX_NODES; cpu ++) if ((node_cpu[cpu] != 0) && (node_cpu[cpu] != maxcpu)) node_equal = 0;
-
- if (lnodemask) {
-
- #ifdef DEBUG
- fprintf(stderr, "Node mask = %lx\n", lnodemask);
- #endif
-
- my_set_mempolicy(MPOL_INTERLEAVE, &lnodemask, sizeof(lnodemask) * 8);
-
- numnodes = popcount(lnodemask);
- }
- }
-
/*
 * Queries the shared memory segment `id` with IPC_STAT and returns the
 * result of shmctl(): 0 when the query succeeds, non-zero when it fails.
 * Callers treat a non-zero result as "the segment no longer exists".
 */
static inline int is_dead(int id) {

  struct shmid_ds info;
  int status = shmctl(id, IPC_STAT, &info);

  return status;
}
-
- static int open_shmem(void) {
-
- int try = 0;
-
- int err = 0;
-
- do {
-
- #if defined(BIGNUMA)
- // raised to 32768, enough for 128 nodes and 1024 cups
- shmid = shmget(SH_MAGIC, 32768, 0666);
- #else
- shmid = shmget(SH_MAGIC, 4096, 0666);
- #endif
-
- if (shmid == -1) {
- #if defined(BIGNUMA)
- shmid = shmget(SH_MAGIC, 32768, IPC_CREAT | 0666);
- #else
- shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666);
- #endif
- }
-
- if (shmid == -1) err = errno;
-
- try ++;
-
- } while ((try < 10) && (shmid == -1));
-
- if (shmid == -1) {
- fprintf (stderr, "Obtaining shared memory segment failed in open_shmem: %s\n",strerror(err));
- fprintf (stderr, "Setting CPU affinity not possible without shared memory access.\n");
- return (1);
- }
-
- if (shmid != -1) {
- if ( (common = shmat(shmid, NULL, 0)) == (void*)-1) {
- perror ("Attaching shared memory segment failed in open_shmem");
- fprintf (stderr, "Setting CPU affinity not possible without shared memory access.\n");
- return (1);
- }
- }
- #ifdef DEBUG
- fprintf(stderr, "Shared Memory id = %x Address = %p\n", shmid, common);
- #endif
- return (0);
- }
-
- static int create_pshmem(void) {
-
- pshmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666);
-
- if (pshmid == -1) {
- perror ("Obtaining shared memory segment failed in create_pshmem");
- fprintf (stderr, "Setting CPU affinity not possible without shared memory access.\n");
- return(1);
- }
-
- if ( (paddr = shmat(pshmid, NULL, 0)) == (void*)-1) {
- perror ("Attaching shared memory segment failed in create_pshmem");
- fprintf (stderr, "Setting CPU affinity not possible without shared memory access.\n");
- return (1);
- }
-
- if (shmctl(pshmid, IPC_RMID, 0) == -1) return (1);
-
- #ifdef DEBUG
- fprintf(stderr, "Private Shared Memory id = %x Address = %p\n", pshmid, paddr);
- #endif
- return(0);
- }
-
- static void local_cpu_map(void) {
-
- int cpu, id, mapping;
- int bitmask_idx = 0;
- cpu = 0;
- mapping = 0;
-
- do {
- id = common -> cpu_use[cpu];
-
- if (id > 0) {
- if (is_dead(id)) common -> cpu_use[cpu] = 0;
- }
-
- bitmask_idx = CPUELT(cpu);
- if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
-
- common -> cpu_use[cpu] = pshmid;
- cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
- cpu_sub_mapping[mapping] = cpu;
-
- mapping ++;
- }
-
- cpu ++;
-
- } while ((mapping < numprocs) && (cpu < common -> final_num_procs));
-
- disable_mapping = 0;
-
- if ((mapping < numprocs) || (numprocs == 1)) {
- for (cpu = 0; cpu < common -> final_num_procs; cpu ++) {
- if (common -> cpu_use[cpu] == pshmid) common -> cpu_use[cpu] = 0;
- }
- disable_mapping = 1;
- }
-
- #ifdef DEBUG
- for (cpu = 0; cpu < numprocs; cpu ++) {
- fprintf(stderr, "Local Mapping : %2d --> %2d (%2d)\n", cpu, cpu_mapping[cpu], cpu_sub_mapping[cpu]);
- }
- #endif
- }
-
- /* Public Functions */
-
- int get_num_procs(void) { return numprocs; }
- int get_num_nodes(void) { return numnodes; }
- int get_node_equal(void) {
-
- return (((blas_cpu_number % numnodes) == 0) && node_equal);
-
- }
-
- int gotoblas_set_affinity(int pos) {
-
- cpu_set_t cpu_mask;
-
- int mynode = 1;
-
- /* if number of threads is larger than initial condition */
- if (pos < 0) {
- sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
- return 0;
- }
-
- if (!disable_mapping) {
-
- mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[pos]]);
-
- #ifdef DEBUG
- fprintf(stderr, "Giving Affinity[%4d %3d] --> %3d My node = %3d\n", getpid(), pos, cpu_mapping[pos], mynode);
- #endif
-
- CPU_ZERO(&cpu_mask);
- CPU_SET (cpu_mapping[pos], &cpu_mask);
-
- sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask);
-
- node_mapping[WhereAmI()] = mynode;
-
- }
-
- return mynode;
- }
-
- int get_node(void) {
-
- if (!disable_mapping) return node_mapping[WhereAmI()];
-
- return 1;
- }
-
- static int initialized = 0;
-
- void gotoblas_affinity_init(void) {
-
- int cpu, num_avail;
- #ifndef USE_OPENMP
- cpu_set_t cpu_mask;
- #endif
- int i;
-
- if (initialized) return;
-
- initialized = 1;
-
- sched_getaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
-
- #ifdef USE_OPENMP
- numprocs = 0;
- #else
- numprocs = readenv_atoi("OPENBLAS_NUM_THREADS");
- if (numprocs == 0) numprocs = readenv_atoi("GOTO_NUM_THREADS");
- #endif
-
- if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS");
-
- if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS");
-
- numnodes = 1;
-
- if (numprocs == 1) {
- disable_mapping = 1;
- return;
- }
-
- if (create_pshmem() != 0) {
- disable_mapping = 1;
- return;
- }
-
- if (open_shmem() != 0) {
- disable_mapping = 1;
- return;
- }
-
- while ((common -> lock) && (common -> magic != SH_MAGIC)) {
- if (is_dead(common -> shmid)) {
- common -> lock = 0;
- common -> shmid = 0;
- common -> magic = 0;
- } else {
- YIELDING;
- }
- }
-
- blas_lock(&common -> lock);
-
- if ((common -> shmid) && is_dead(common -> shmid)) common -> magic = 0;
-
- common -> shmid = pshmid;
-
- if (common -> magic != SH_MAGIC) {
-
- #if defined(__GLIBC_PREREQ)
- #if __GLIBC_PREREQ(2, 7)
- cpu_set_t *cpusetp;
- #else
- cpu_set_t cpuset;
- #endif
- #endif
- int nums;
- int ret;
-
- #ifdef DEBUG
- fprintf(stderr, "Shared Memory Initialization.\n");
- #endif
-
- //returns the number of processors which are currently online
-
- nums = sysconf(_SC_NPROCESSORS_CONF);
-
- #if !defined(__GLIBC_PREREQ)
- common->num_procs = nums >0 ? nums : 2;
- #else
-
- #if !__GLIBC_PREREQ(2, 3)
- common->num_procs = nums >0 ? nums : 2;
- #elif __GLIBC_PREREQ(2, 7)
- cpusetp = CPU_ALLOC(nums>0? nums:1024);
- if (cpusetp == NULL) {
- common->num_procs = nums>0 ? nums: 2;
- } else {
- size_t size;
- size = CPU_ALLOC_SIZE(nums>0? nums: 1024);
- ret = sched_getaffinity(0,size,cpusetp);
- if (ret!=0)
- common->num_procs = nums >0 ? nums : 1;
- else
- common->num_procs = CPU_COUNT_S(size,cpusetp);
- }
- CPU_FREE(cpusetp);
- #else
- ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
- if (ret!=0) {
- common->num_procs = nums >0 ? nums : 2;
- } else {
- #if !__GLIBC_PREREQ(2, 6)
- int i;
- int n = 0;
- for (i=0;i<(nums >0 ?nums:1024) ;i++)
- if (CPU_ISSET(i,&cpuset)) n++;
- common->num_procs = n;
- }
- #else
- common->num_procs = CPU_COUNT(&cpuset);
- }
- #endif
-
- #endif
- #endif
- if(common -> num_procs > MAX_CPUS) {
- fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
- exit(1);
- }
-
- for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
-
- numa_check();
-
- disable_hyperthread();
-
- if (common -> num_nodes > 1) numa_mapping();
-
- #ifdef ARCH_LOONGARCH64
- common -> final_num_procs = common -> num_procs;
- #else
- common -> final_num_procs = 0;
- for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
- #endif
-
- for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
-
- common -> magic = SH_MAGIC;
-
- }
-
- disable_affinity();
-
- num_avail = 0;
- for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]);
-
- if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;
-
- #ifdef DEBUG
- fprintf(stderr, "Number of threads = %d\n", numprocs);
- #endif
-
- local_cpu_map();
-
- blas_unlock(&common -> lock);
-
- #ifndef USE_OPENMP
- if (!disable_mapping) {
-
- #ifdef DEBUG
- fprintf(stderr, "Giving Affinity[%3d] --> %3d\n", 0, cpu_mapping[0]);
- #endif
-
- CPU_ZERO(&cpu_mask);
- CPU_SET (cpu_mapping[0], &cpu_mask);
-
- sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask);
-
- node_mapping[WhereAmI()] = READ_NODE(common -> cpu_info[cpu_sub_mapping[0]]);
-
- setup_mempolicy();
-
- if (readenv_atoi("OPENBLAS_MAIN_FREE") || readenv_atoi("GOTOBLAS_MAIN_FREE")) {
- sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
- }
-
- }
- #endif
-
- #ifdef DEBUG
- fprintf(stderr, "Initialization is done.\n");
- #endif
- }
-
- void gotoblas_affinity_quit(void) {
-
- int i;
- struct shmid_ds ds;
-
- #ifdef DEBUG
- fprintf(stderr, "Terminating ..\n");
- #endif
-
- if ((numprocs == 1) || (initialized == 0)) return;
-
- if (!disable_mapping) {
-
- blas_lock(&common -> lock);
-
- for (i = 0; i < numprocs; i ++) common -> cpu_use[cpu_mapping[i]] = -1;
-
- blas_unlock(&common -> lock);
-
- }
-
- shmctl(shmid, IPC_STAT, &ds);
-
- if (ds.shm_nattch == 1) shmctl(shmid, IPC_RMID, 0);
-
- shmdt(common);
-
- shmdt(paddr);
-
- initialized = 0;
- }
-
- #else
-
/*
 * Builds without OS_LINUX + SMP: CPU affinity is not managed, so these
 * entry points do nothing.
 */
void gotoblas_affinity_init(void) {}

void gotoblas_set_affinity(int threads) { (void)threads; }

void gotoblas_set_affinity2(int threads) { (void)threads; }

void gotoblas_affinity_reschedule(void) {}
-
/*
 * Number of configured processors as reported by sysconf(), or 2 when
 * sysconf() cannot tell.
 */
int get_num_procs(void) {
  int num = sysconf(_SC_NPROCESSORS_CONF);

  return (num > 0) ? num : 2;
}
-
/* Without affinity support everything is reported as a single node. */
int get_num_nodes(void) {
  return 1;
}

/* Node of the caller; always 1 in this configuration. */
int get_node(void) {
  return 1;
}
- #endif
-
|