Browse Source

Modify scripts.

pull/2/head
HugoZHL 3 years ago
parent
commit
311bfde6c1
3 changed files with 8 additions and 9 deletions
  1. +4
    -4
      examples/cnn/main.py
  2. +1
    -2
      examples/cnn/scripts/hetu_16gpu.sh
  3. +3
    -3
      examples/cnn/scripts/hetu_8gpu.sh

+ 4
- 4
examples/cnn/main.py View File

@@ -44,7 +44,9 @@ if __name__ == "__main__":
device_id = 0
print_rank0("Training {} on HETU".format(args.model))
if args.comm_mode in ('AllReduce', 'Hybrid'):
- comm, device_id = ht.mpi_nccl_init()
+ comm = ht.wrapped_mpi_nccl_init()
+ device_id = comm.dev_id
+ rank = comm.rank
executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0)
else:
if args.gpu == -1:
@@ -197,6 +199,4 @@ if __name__ == "__main__":
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
- (args.num_epochs, running_time))
- if args.comm_mode in ('AllReduce', 'Hybrid'):
- ht.mpi_nccl_finish(comm)
+ (args.num_epochs, running_time))

+ 1
- 2
examples/cnn/scripts/hetu_16gpu.sh View File

@@ -5,5 +5,4 @@ mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..
echo $depsdir
### validate and timing
- $depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce
-
+ mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce

+ 3
- 3
examples/cnn/scripts/hetu_8gpu.sh View File

@@ -1,8 +1,8 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..
echo $depsdir
### validate and timing
#
NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce
NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=$depsdir/python python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce

Loading…
Cancel
Save