From 311bfde6c19cfbd3eab89fb38d57aa5cf4f82aab Mon Sep 17 00:00:00 2001 From: HugoZHL <657671989@qq.com> Date: Fri, 1 Apr 2022 10:24:42 +0000 Subject: [PATCH 1/2] Modify scripts. --- examples/cnn/main.py | 8 ++++---- examples/cnn/scripts/hetu_16gpu.sh | 3 +-- examples/cnn/scripts/hetu_8gpu.sh | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/examples/cnn/main.py b/examples/cnn/main.py index 1a4a224..e2abcff 100644 --- a/examples/cnn/main.py +++ b/examples/cnn/main.py @@ -44,7 +44,9 @@ if __name__ == "__main__": device_id = 0 print_rank0("Training {} on HETU".format(args.model)) if args.comm_mode in ('AllReduce', 'Hybrid'): - comm, device_id = ht.mpi_nccl_init() + comm = ht.wrapped_mpi_nccl_init() + device_id = comm.dev_id + rank = comm.rank executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0) else: if args.gpu == -1: @@ -197,6 +199,4 @@ if __name__ == "__main__": print_rank0("Validation accuracy = %f" % accuracy) print_rank0("*"*50) print_rank0("Running time of total %d epoch = %fs" % - (args.num_epochs, running_time)) - if args.comm_mode in ('AllReduce', 'Hybrid'): - ht.mpi_nccl_finish(comm) + (args.num_epochs, running_time)) \ No newline at end of file diff --git a/examples/cnn/scripts/hetu_16gpu.sh b/examples/cnn/scripts/hetu_16gpu.sh index 4b4c130..b5df286 100644 --- a/examples/cnn/scripts/hetu_16gpu.sh +++ b/examples/cnn/scripts/hetu_16gpu.sh @@ -5,5 +5,4 @@ mainpy=${workdir}/../main.py depsdir=${workdir}/../../.. echo $depsdir ### validate and timing -$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce - +mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce \ No newline at end of file diff --git a/examples/cnn/scripts/hetu_8gpu.sh b/examples/cnn/scripts/hetu_8gpu.sh index f2a99e3..d45f02c 100644 --- a/examples/cnn/scripts/hetu_8gpu.sh +++ b/examples/cnn/scripts/hetu_8gpu.sh @@ -1,8 +1,8 @@ #!/bin/bash + workdir=$(cd $(dirname $0); pwd) mainpy=${workdir}/../main.py depsdir=${workdir}/../../.. - +echo $depsdir ### validate and timing -# -NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce +NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=$depsdir/python python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce From e0fd6f3fc7f83c45830d7a79d100a8c85645eb7a Mon Sep 17 00:00:00 2001 From: HugoZHL <657671989@qq.com> Date: Fri, 1 Apr 2022 12:21:19 +0000 Subject: [PATCH 2/2] Modify default cmake option. --- cmake/config.example.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/config.example.cmake b/cmake/config.example.cmake index 86224f1..5880189 100644 --- a/cmake/config.example.cmake +++ b/cmake/config.example.cmake @@ -17,7 +17,7 @@ set(HETU_PS ON) # whether to compile geometric module (for GNNs) # pybind11(*), metis(*) required -set(HETU_GEOMETRIC ON) +set(HETU_GEOMETRIC OFF) # whether to compile cache module (for PS) # to enable this, you must turn HETU_PS on @@ -25,8 +25,8 @@ set(HETU_GEOMETRIC ON) set(HETU_CACHE ON) # whether to compile Hetu ML Module -set(HETU_ML ON) -set(HETU_PARALLEL_ML ON) +set(HETU_ML OFF) +set(HETU_PARALLEL_ML OFF) ###################### ### Set paths ########