| @@ -17,7 +17,7 @@ set(HETU_PS ON) | |||||
| # whether to compile geometric module (for GNNs) | # whether to compile geometric module (for GNNs) | ||||
| # pybind11(*), metis(*) required | # pybind11(*), metis(*) required | ||||
| set(HETU_GEOMETRIC ON) | |||||
| set(HETU_GEOMETRIC OFF) | |||||
| # whether to compile cache module (for PS) | # whether to compile cache module (for PS) | ||||
| # to enable this, you must turn HETU_PS on | # to enable this, you must turn HETU_PS on | ||||
| @@ -25,8 +25,8 @@ set(HETU_GEOMETRIC ON) | |||||
| set(HETU_CACHE ON) | set(HETU_CACHE ON) | ||||
| # whether to compile Hetu ML Module | # whether to compile Hetu ML Module | ||||
| set(HETU_ML ON) | |||||
| set(HETU_PARALLEL_ML ON) | |||||
| set(HETU_ML OFF) | |||||
| set(HETU_PARALLEL_ML OFF) | |||||
| ###################### | ###################### | ||||
| ### Set paths ######## | ### Set paths ######## | ||||
| @@ -44,7 +44,9 @@ if __name__ == "__main__": | |||||
| device_id = 0 | device_id = 0 | ||||
| print_rank0("Training {} on HETU".format(args.model)) | print_rank0("Training {} on HETU".format(args.model)) | ||||
| if args.comm_mode in ('AllReduce', 'Hybrid'): | if args.comm_mode in ('AllReduce', 'Hybrid'): | ||||
| comm, device_id = ht.mpi_nccl_init() | |||||
| comm = ht.wrapped_mpi_nccl_init() | |||||
| device_id = comm.dev_id | |||||
| rank = comm.rank | |||||
| executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0) | executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0) | ||||
| else: | else: | ||||
| if args.gpu == -1: | if args.gpu == -1: | ||||
| @@ -197,6 +199,4 @@ if __name__ == "__main__": | |||||
| print_rank0("Validation accuracy = %f" % accuracy) | print_rank0("Validation accuracy = %f" % accuracy) | ||||
| print_rank0("*"*50) | print_rank0("*"*50) | ||||
| print_rank0("Running time of total %d epoch = %fs" % | print_rank0("Running time of total %d epoch = %fs" % | ||||
| (args.num_epochs, running_time)) | |||||
| if args.comm_mode in ('AllReduce', 'Hybrid'): | |||||
| ht.mpi_nccl_finish(comm) | |||||
| (args.num_epochs, running_time)) | |||||
| @@ -5,5 +5,4 @@ mainpy=${workdir}/../main.py | |||||
| depsdir=${workdir}/../../.. | depsdir=${workdir}/../../.. | ||||
| echo $depsdir | echo $depsdir | ||||
| ### validate and timing | ### validate and timing | ||||
| $depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce | |||||
| mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce | |||||
| @@ -1,8 +1,8 @@ | |||||
| #!/bin/bash | #!/bin/bash | ||||
| workdir=$(cd $(dirname $0); pwd) | workdir=$(cd $(dirname $0); pwd) | ||||
| mainpy=${workdir}/../main.py | mainpy=${workdir}/../main.py | ||||
| depsdir=${workdir}/../../.. | depsdir=${workdir}/../../.. | ||||
| echo $depsdir | |||||
| ### validate and timing | ### validate and timing | ||||
| # | |||||
| NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce | |||||
| NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=$depsdir/python python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce | |||||