Browse Source

Merge pull request 'Modify scripts for cnn example.' (#2) from AlfredWang/Hetu:master into master

master
Hsword 3 years ago
parent
commit
6df869f592
4 changed files with 11 additions and 12 deletions
  1. +3
    -3
      cmake/config.example.cmake
  2. +4
    -4
      examples/cnn/main.py
  3. +1
    -2
      examples/cnn/scripts/hetu_16gpu.sh
  4. +3
    -3
      examples/cnn/scripts/hetu_8gpu.sh

+ 3
- 3
cmake/config.example.cmake View File

@@ -17,7 +17,7 @@ set(HETU_PS ON)

# whether to compile geometric module (for GNNs)
# pybind11(*), metis(*) required
-set(HETU_GEOMETRIC ON)
+set(HETU_GEOMETRIC OFF)

# whether to compile cache module (for PS)
# to enable this, you must turn HETU_PS on
@@ -25,8 +25,8 @@ set(HETU_GEOMETRIC ON)
set(HETU_CACHE ON)

# whether to compile Hetu ML Module
-set(HETU_ML ON)
-set(HETU_PARALLEL_ML ON)
+set(HETU_ML OFF)
+set(HETU_PARALLEL_ML OFF)

######################
### Set paths ########


+ 4
- 4
examples/cnn/main.py View File

@@ -44,7 +44,9 @@ if __name__ == "__main__":
device_id = 0
print_rank0("Training {} on HETU".format(args.model))
if args.comm_mode in ('AllReduce', 'Hybrid'):
-comm, device_id = ht.mpi_nccl_init()
+comm = ht.wrapped_mpi_nccl_init()
+device_id = comm.dev_id
+rank = comm.rank
executor_ctx = ht.gpu(device_id % 8) if args.gpu >= 0 else ht.cpu(0)
else:
if args.gpu == -1:
@@ -197,6 +199,4 @@ if __name__ == "__main__":
print_rank0("Validation accuracy = %f" % accuracy)
print_rank0("*"*50)
print_rank0("Running time of total %d epoch = %fs" %
-(args.num_epochs, running_time))
-if args.comm_mode in ('AllReduce', 'Hybrid'):
-ht.mpi_nccl_finish(comm)
+(args.num_epochs, running_time))

+ 1
- 2
examples/cnn/scripts/hetu_16gpu.sh View File

@@ -5,5 +5,4 @@ mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..
echo $depsdir
### validate and timing
-$depsdir/build/_deps/openmpi-build/bin/mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce
-
+mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=$depsdir/python -H daim117:8,daim118:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce

+ 3
- 3
examples/cnn/scripts/hetu_8gpu.sh View File

@@ -1,8 +1,8 @@
#!/bin/bash

workdir=$(cd $(dirname $0); pwd)
mainpy=${workdir}/../main.py
depsdir=${workdir}/../../..
echo $depsdir
### validate and timing
#
-NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/public/third_party_tests/Athena/python /root/anaconda3/envs/zhl/bin/python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce
+NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=$depsdir/python python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce

Loading…
Cancel
Save