| @@ -29,7 +29,7 @@ def render_template(template_file_path, context): | |||
| class TemplateManager: | |||
| """BaseNetwork code generator.""" | |||
| replace_template_suffixes = [('.py-tpl', '.py')] | |||
| replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')] | |||
| def __init__(self, template_base_dir, exclude_dirs=None, exclude_files=None): | |||
| self.template_base_dir = template_base_dir | |||
| @@ -70,7 +70,7 @@ class TemplateManager: | |||
| """Generate the network files.""" | |||
| source_files = [] | |||
| template_files = self.get_template_files() | |||
| extensions = tuple(options.get('extensions', '.py')) | |||
| extensions = tuple([new_extension for _, new_extension in self.replace_template_suffixes]) | |||
| for template_file in template_files: | |||
| new_file_path = template_file | |||
| template_file_path = template_file | |||
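Deriving the accepted extensions from `replace_template_suffixes` keeps the two lists from drifting apart; note the old `tuple(options.get('extensions', '.py'))` would even split the default string into `('.', 'p', 'y')`. A minimal standalone sketch of the idea (`target_path` is an illustrative helper, not part of the wizard code):

```python
# Sketch: derive target extensions from the suffix map and rename template files.
replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')]

def target_path(template_file):
    """Map 'train.py-tpl' -> 'train.py'; leave non-template files untouched."""
    for old_suffix, new_suffix in replace_template_suffixes:
        if template_file.endswith(old_suffix):
            return template_file[:-len(old_suffix)] + new_suffix
    return template_file

extensions = tuple(new_ext for _, new_ext in replace_template_suffixes)
print(target_path('src/train.py-tpl'))  # src/train.py
print(extensions)                       # ('.py', '.sh', '.md')
```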
| @@ -0,0 +1,135 @@ | |||
| # AlexNet Example | |||
| ## Description | |||
These are examples of training AlexNet on the CIFAR-10 or ImageNet dataset with MindSpore.
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
- Download the dataset; the expected directory structure is as follows:
| {% if dataset=='Cifar10' %} | |||
| CIFAR-10 | |||
| ``` | |||
└─Data
    ├─test
    │    cifar-10-verify-bin
    │
    └─train
         cifar-10-batches-bin
| ``` | |||
| {% elif dataset=='ImageNet' %} | |||
| ImageNet | |||
| ``` | |||
└─Data
    ├─test
    │    validation_preprocess
    │
    └─train
         ilsvrc
| ``` | |||
| {% endif %} | |||
| ## Structure | |||
| ```shell | |||
| . | |||
└── alexnet
    ├── README.md
    ├── script
    │   ├── run_distribute_train.sh      # launch distributed training (8 devices)
    │   ├── run_eval.sh                  # launch evaluation
    │   ├── run_standalone_train.sh      # launch standalone training (1 device)
    │   ├── run_distribute_train_gpu.sh  # launch GPU distributed training (4 devices)
    │   ├── run_eval_gpu.sh              # launch GPU evaluation
    │   └── run_standalone_train_gpu.sh  # launch GPU standalone training (1 device)
    ├── src
    │   ├── config.py                    # parameter configuration
    │   ├── dataset.py                   # data preprocessing
    │   ├── generator_lr.py              # generate learning rate for each step
    │   └── alexnet.py                   # AlexNet network definition
    ├── eval.py                          # eval net
    └── train.py                         # train net
| ``` | |||
| ## Parameter configuration | |||
| Parameters for both training and evaluation can be set in src/config.py. | |||
| ## Running the example | |||
| ### Train | |||
| #### Usage | |||
| ``` | |||
| # distributed training | |||
| Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training | |||
| Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
# distributed training example
| ./run_distribute_train.sh rank_table.json ~/dataset_path | |||
| # standalone training example | |||
| ./run_standalone_train.sh ~/dataset_path | |||
| ``` | |||
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
| #### Result | |||
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find checkpoint files together with log output like the following.
| ``` | |||
| epoch: 1 step: 1, loss is 2.3041954 | |||
| epoch: 1 step: 2, loss is 2.3079312 | |||
| ... | |||
| epoch: 1 step: 601, loss is 2.314184 | |||
| epoch: 1 step: 603, loss is 2.305666 | |||
| ... | |||
| ``` | |||
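The per-step loss lines above have a fixed shape, so they are easy to post-process; a small sketch (a hypothetical helper, not part of this example) that extracts the loss values from such a log:

```python
import re

# Parse "epoch: 1 step: 601, loss is 2.314184" lines from a training log.
pattern = re.compile(r"epoch: (\d+) step: (\d+), loss is ([\d.]+)")
with open("log") as log_file:  # the file the launch scripts redirect output into
    losses = [float(m.group(3)) for m in map(pattern.search, log_file) if m]
print("steps parsed:", len(losses), "last loss:", losses[-1] if losses else None)
```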
| ### Evaluation | |||
| #### Usage | |||
| ``` | |||
| # evaluation | |||
| Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
| # evaluation example | |||
| ./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt | |||
| ``` | |||
> The checkpoint file is produced during the training process.
| ### Running on GPU | |||
| ``` | |||
| # distributed training example | |||
| ./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training example | |||
| ./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # infer example | |||
| ./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| @@ -24,6 +24,7 @@ from mindspore.common import dtype as mstype | |||
| from .config import cfg | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create dataset for train or test | |||
| @@ -66,6 +67,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| cifar_ds = cifar_ds.repeat(repeat_size) | |||
| return cifar_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create a train or eval imagenet dataset | |||
| @@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| return ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -1,98 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| Produce the dataset | |||
| """ | |||
| import os | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as CV | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore.dataset.transforms.vision import Inter | |||
| from mindspore.common import dtype as mstype | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create dataset for train or test | |||
| """ | |||
| if do_train: | |||
| data_path = os.path.join(data_path, "train") | |||
| else: | |||
| data_path = os.path.join(data_path, "test") | |||
| if target == 'Ascend': | |||
| device_num, rank_id = _get_rank_info() | |||
| elif target == 'GPU': | |||
| init("nccl") | |||
| rank_id = get_rank() | |||
| device_num = get_group_size() | |||
| else: | |||
| device_num = 1 | |||
| # define dataset | |||
| if device_num == 1: | |||
| mnist_ds = ds.MnistDataset(data_path) | |||
| else: | |||
| mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| resize_height, resize_width = cfg.image_height, cfg.image_width | |||
| rescale = 1.0 / 255.0 | |||
| shift = 0.0 | |||
| rescale_nml = 1 / 0.3081 | |||
| shift_nml = -1 * 0.1307 / 0.3081 | |||
| # define map operations | |||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||
| rescale_op = CV.Rescale(rescale, shift) | |||
| hwc2chw_op = CV.HWC2CHW() | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| # apply map operations on images | |||
| mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op) | |||
| # apply DatasetOps | |||
| buffer_size = 10000 | |||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) | |||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||
| return mnist_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| """ | |||
| rank_size = int(os.environ.get("RANK_SIZE", 1)) | |||
| if rank_size > 1: | |||
| rank_size = get_group_size() | |||
| rank_id = get_rank() | |||
| else: | |||
| rank_size = 1 | |||
| rank_id = 0 | |||
| return rank_size, rank_id | |||
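The per-dataset templates that replace this deleted helper keep the same `create_dataset` signature, so call sites stay unchanged; a usage sketch of the function above (the path is a placeholder, and with `RANK_SIZE` unset the run is single-device, so no sharding happens):

```python
# Usage sketch for the create_dataset signature above (placeholder path).
train_ds = create_dataset("/path/to/MNIST", batch_size=32, repeat_size=1,
                          do_train=True, target="Ascend")
print(train_ds.get_dataset_size())  # batches per epoch after drop_remainder
```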
| @@ -18,6 +18,7 @@ eval alexnet according to model file: | |||
python eval.py --dataset_path /YourDataPath --checkpoint_path Your.ckpt
| """ | |||
| import os | |||
| import argparse | |||
| from src.config import cfg | |||
| from src.dataset import create_dataset | |||
| @@ -33,15 +34,16 @@ if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||
| parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved') | |||
parser.add_argument('--checkpoint_path', type=str, default="./ckpt", help='for evaluation, must provide\
the path of the trained checkpoint file')
parser.add_argument('--dataset_sink_mode', type=str, default='True', choices=['True', 'False'],
| help='DataSet sink mode is True or False') | |||
| args = parser.parse_args() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| data_path = args.data_path | |||
device_id = int(os.getenv('DEVICE_ID', '0'))
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=device_id) | |||
| data_path = args.dataset_path | |||
dataset_sink_mode = args.dataset_sink_mode == 'True'
| network = AlexNet(cfg.num_classes) | |||
| @@ -50,15 +52,10 @@ if __name__ == "__main__": | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) | |||
| {% endif %} | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()}) | |||
| print("============== Starting Testing ==============") | |||
| param_dict = load_checkpoint(args.ckpt_path) | |||
| param_dict = load_checkpoint(args.checkpoint_path) | |||
| load_param_into_net(network, param_dict) | |||
| do_train = False | |||
| ds_eval = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train, | |||
| @@ -16,7 +16,7 @@ | |||
| if [ $# != 2 ] && [ $# != 3 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| @@ -31,15 +31,15 @@ get_real_path(){ | |||
| PATH1=$(get_real_path $1) | |||
| PATH2=$(get_real_path $2) | |||
| if [ ! -d $PATH1 ] | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| exit 1 | |||
| fi | |||
| if [ ! -f $PATH2 ] | |||
| if [ ! -d $PATH2 ] | |||
| then | |||
| echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file" | |||
| echo "error: DATASET_PATH=$PATH2 is not a directory" | |||
| exit 1 | |||
| fi | |||
| @@ -56,15 +56,15 @@ fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH2 | |||
| export RANK_TABLE_FILE=$PATH2 | |||
| rank_start=$((DEVICE_NUM * SERVER_ID)) | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| for((i=0; i<DEVICE_NUM; i++)) | |||
| start_id=0 | |||
| for((i=start_id; i<DEVICE_NUM + start_id; i++)) | |||
| do | |||
| export DEVICE_ID=$i | |||
| export RANK_ID=$((rank_start + i)) | |||
| export RANK_ID=$((i - start_id)) | |||
| rm -rf ./train_parallel$i | |||
| mkdir ./train_parallel$i | |||
| cp ../*.py ./train_parallel$i | |||
| @@ -75,12 +75,12 @@ do | |||
| env > env.log | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log & | |||
| fi | |||
| if [ $# == 3 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log & | |||
| fi | |||
| cd .. | |||
| @@ -1,53 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=4 | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| @@ -0,0 +1,75 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| @@ -61,6 +61,6 @@ cp *.sh ./eval | |||
| cp -r ../src ./eval | |||
| cd ./eval || exit | |||
| env > env.log | |||
| echo "start evaluation for device $DEVICE_ID" | |||
| echo "start evaluation" | |||
| python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & | |||
| cd .. | |||
| @@ -1,59 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| cd .. | |||
| @@ -0,0 +1,77 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| cd .. | |||
| @@ -17,17 +17,20 @@ import mindspore.nn as nn | |||
| from mindspore.common.initializer import TruncatedNormal | |||
| from mindspore.ops import operations as P | |||
| def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"): | |||
| weight = weight_variable() | |||
| return nn.Conv2d(in_channels, out_channels, | |||
| kernel_size=kernel_size, stride=stride, padding=padding, | |||
| weight_init=weight, has_bias=False, pad_mode=pad_mode) | |||
| def fc_with_initialize(input_channels, out_channels): | |||
| weight = weight_variable() | |||
| bias = weight_variable() | |||
| return nn.Dense(input_channels, out_channels, weight, bias) | |||
| def weight_variable(): | |||
| return TruncatedNormal(0.02) | |||
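Both helpers share the `TruncatedNormal(0.02)` initializer above; as a usage sketch, the classic AlexNet front end would be assembled from them like this (shapes shown for illustration only, not taken from this diff):

```python
# Usage sketch of the helpers above (classic AlexNet shapes, for illustration).
conv1 = conv(3, 96, 11, stride=4)            # first conv block: 3 -> 96 channels, 11x11 kernel
fc1 = fc_with_initialize(6 * 6 * 256, 4096)  # flattened conv features -> first dense layer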
| @@ -26,15 +26,21 @@ cfg = edict({ | |||
| {% elif dataset=='ImageNet' %} | |||
| 'num_classes': 1001, | |||
| {% endif %} | |||
| 'lr': 0.002, | |||
| {% if optimizer=='Momentum' %} | |||
| 'lr': 0.002, | |||
| "momentum": 0.9, | |||
| {% elif optimizer=='SGD' %} | |||
| 'lr': 0.1, | |||
| {% else %} | |||
| 'lr': 0.001, | |||
| {% endif %} | |||
| 'epoch_size': 1, | |||
| 'batch_size': 32, | |||
| 'loss_scale': 1024, | |||
| 'buffer_size': 1000, | |||
| 'image_height': 227, | |||
| 'image_width': 227, | |||
| 'weight_decay': 1e-4, | |||
| 'save_checkpoint': True, | |||
| 'save_checkpoint_epochs': 5, | |||
| 'keep_checkpoint_max': 10, | |||
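Once rendered with `optimizer=='Momentum'` and `dataset=='Cifar10'`, the Jinja branches above collapse into a flat dict; an illustrative sketch of the rendered `src/config.py`:

```python
# Rendered src/config.py for optimizer == 'Momentum', dataset == 'Cifar10' (illustrative).
from easydict import EasyDict as edict

cfg = edict({
    'num_classes': 10,          # Cifar10 branch
    'lr': 0.002,
    'momentum': 0.9,            # only present for the Momentum optimizer
    'epoch_size': 1,
    'batch_size': 32,
    'loss_scale': 1024,
    'buffer_size': 1000,
    'image_height': 227,
    'image_width': 227,
    'weight_decay': 1e-4,
    'save_checkpoint': True,
    'save_checkpoint_epochs': 5,
    'keep_checkpoint_max': 10,
})
```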
| @@ -18,6 +18,7 @@ train alexnet and get network model files(.ckpt) : | |||
python train.py --dataset_path /YourDataPath
| """ | |||
| import os | |||
| import argparse | |||
| from src.config import cfg | |||
| from src.dataset import create_dataset | |||
| @@ -26,9 +27,10 @@ from src.alexnet import AlexNet | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore import Tensor | |||
| from mindspore.train import Model | |||
| from mindspore.train import Model, ParallelMode | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore.train.loss_scale_manager import FixedLossScaleManager | |||
| from mindspore.parallel._auto_parallel_context import auto_parallel_context | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -41,7 +43,7 @@ if __name__ == "__main__": | |||
| parser.add_argument('--device_num', type=int, default=1, help='Device num') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--pre_trained', type=str, default=None, help='Pre-trained checkpoint path') | |||
parser.add_argument('--dataset_sink_mode', type=str, default='True', choices=['True', 'False'],
| help='DataSet sink mode is True or False') | |||
| @@ -58,7 +60,6 @@ if __name__ == "__main__": | |||
| context.set_context(device_id=device_id, enable_auto_mixed_precision=True) | |||
| context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| mirror_mean=True) | |||
| auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160]) | |||
| init() | |||
| # GPU target | |||
| @@ -69,7 +70,7 @@ if __name__ == "__main__": | |||
| ckpt_save_dir = cfg.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" | |||
| data_path = args.data_path | |||
| data_path = args.dataset_path | |||
| do_train = True | |||
| ds_train = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train, | |||
| @@ -77,14 +78,14 @@ if __name__ == "__main__": | |||
| step_size = ds_train.get_dataset_size() | |||
| # define net | |||
| network = AlexNet(cfg.num_classes) | |||
| net = AlexNet(cfg.num_classes) | |||
| # init weight | |||
| if args.pre_trained: | |||
| param_dict = load_checkpoint(args.pre_trained) | |||
| load_param_into_net(network, param_dict) | |||
| load_param_into_net(net, param_dict) | |||
| else: | |||
| for _, cell in network.cells_and_names(): | |||
| for _, cell in net.cells_and_names(): | |||
| if isinstance(cell, nn.Conv2d): | |||
| cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), | |||
| cell.weight.default_input.shape, | |||
| @@ -93,20 +94,37 @@ if __name__ == "__main__": | |||
| cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), | |||
| cell.weight.default_input.shape, | |||
| cell.weight.default_input.dtype).to_tensor() | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| # define learning rate | |||
| lr = Tensor(get_lr(0, cfg.lr, cfg.epoch_size, ds_train.get_dataset_size())) | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=lr, momentum=cfg.momentum) | |||
| {% endif %} | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| # define loss, model | |||
| if target == "Ascend": | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum, | |||
| weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale) | |||
| {% else %} | |||
| opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=cfg.lr) | |||
| {% endif %} | |||
| loss_scale = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False) | |||
| model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, | |||
| amp_level="O2", keep_batchnorm_fp32=False) | |||
| else: | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(net.trainable_params(), learning_rate=lr, momentum=cfg.momentum) | |||
| {% else %} | |||
| opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=lr) | |||
| {% endif %} | |||
| model = Model(net, loss, opt, metrics={"Accuracy": Accuracy()}) | |||
| # define callbacks | |||
| time_cb = TimeMonitor(data_size=step_size) | |||
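On Ascend the model trains in mixed precision (`amp_level="O2"`), which is why the fixed scale `cfg.loss_scale` appears both in `FixedLossScaleManager` and on `nn.Momentum` (the optimizer divides the scaled update back out). A small NumPy illustration of the underflow problem this solves:

```python
import numpy as np

# fp16 flushes values below ~6e-8 to zero; scaling the loss keeps tiny gradients alive.
tiny_grad = 1e-8
print(np.float16(tiny_grad))         # 0.0 -- underflows in fp16
print(np.float16(tiny_grad * 1024))  # ~1.02e-05 -- survives after scaling by cfg.loss_scale
# The optimizer later divides the update by the same factor in higher precision:
print(np.float32(np.float16(tiny_grad * 1024)) / 1024)  # ~1e-08 recovered
```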
| @@ -114,7 +132,7 @@ if __name__ == "__main__": | |||
| cb = [time_cb, loss_cb] | |||
| if cfg.save_checkpoint: | |||
| cfg_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * step_size, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| ckpt_cb = ModelCheckpoint(prefix="alexnet", directory=ckpt_save_dir, config=cfg_ck) | |||
| cb += [ckpt_cb] | |||
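With `save_checkpoint_steps=cfg.save_checkpoint_epochs * step_size`, a checkpoint lands every five epochs under the config above. The training loop itself lies outside this hunk; it typically ends with a call like the following sketch, using the names defined above:

```python
# Sketch: the usual tail of train.py, driving the callbacks assembled above.
model.train(cfg.epoch_size, ds_train, callbacks=cb,
            dataset_sink_mode=(args.dataset_sink_mode == 'True'))
```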
| @@ -0,0 +1,120 @@ | |||
| # LeNet Example | |||
| ## Description | |||
These are examples of training LeNet on the MNIST dataset with MindSpore.
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
- Download the dataset; the expected directory structure is as follows:
| ``` | |||
└─Data
    ├─test
    │    t10k-images.idx3-ubyte
    │    t10k-labels.idx1-ubyte
    │
    └─train
         train-images.idx3-ubyte
         train-labels.idx1-ubyte
| ``` | |||
| ## Structure | |||
| ```shell | |||
| . | |||
└── lenet
    ├── README.md
    ├── script
    │   ├── run_distribute_train.sh      # launch distributed training (8 devices)
    │   ├── run_eval.sh                  # launch evaluation
    │   ├── run_standalone_train.sh      # launch standalone training (1 device)
    │   ├── run_distribute_train_gpu.sh  # launch GPU distributed training (8 devices)
    │   ├── run_eval_gpu.sh              # launch GPU evaluation
    │   └── run_standalone_train_gpu.sh  # launch GPU standalone training (1 device)
    ├── src
    │   ├── config.py                    # parameter configuration
    │   ├── dataset.py                   # data preprocessing
    │   └── lenet.py                     # LeNet network definition
    ├── eval.py                          # eval net
    └── train.py                         # train net
| ``` | |||
| ## Parameter configuration | |||
| Parameters for both training and evaluation can be set in src/config.py. | |||
| ## Running the example | |||
| ### Train | |||
| #### Usage | |||
| ``` | |||
| # distributed training | |||
| Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training | |||
| Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
# distributed training example
| ./run_distribute_train.sh rank_table.json ~/MNIST_data | |||
| # standalone training example | |||
| ./run_standalone_train.sh ~/MNIST_data | |||
| ``` | |||
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
| #### Result | |||
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find checkpoint files together with log output like the following.
| ``` | |||
| epoch: 1 step: 1, loss is 2.3041954 | |||
| epoch: 1 step: 2, loss is 2.3079312 | |||
| ... | |||
| epoch: 1 step: 601, loss is 2.314184 | |||
| epoch: 1 step: 603, loss is 2.305666 | |||
| ... | |||
| ``` | |||
| ### Evaluation | |||
| #### Usage | |||
| ``` | |||
| # evaluation | |||
| Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
| # evaluation example | |||
| ./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt | |||
| ``` | |||
> The checkpoint file is produced during the training process.
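The checkpoint written during training is exactly what `eval.py` loads back; a minimal sketch using the serialization helpers already imported there (the path is a placeholder, and `network` is the LeNet5 instance built in `eval.py`):

```python
from mindspore.train.serialization import load_checkpoint, load_param_into_net

# Load a training checkpoint back into a freshly built network (placeholder path).
param_dict = load_checkpoint("checkpoint_lenet-2_937.ckpt")
load_param_into_net(network, param_dict)
```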
| ### Running on GPU | |||
| ``` | |||
| # distributed training example | |||
| ./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training example | |||
| ./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # infer example | |||
| ./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| @@ -17,6 +17,8 @@ | |||
| eval lenet according to model file: | |||
python eval.py --data_path /YourDataPath --checkpoint_path Your.ckpt
| """ | |||
| import os | |||
| import argparse | |||
| import mindspore.nn as nn | |||
| @@ -37,11 +39,12 @@ if __name__ == "__main__": | |||
| help='path where the dataset is saved') | |||
parser.add_argument('--checkpoint_path', type=str, default="", help='for evaluation, must provide\
the path of the trained checkpoint file')
| parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True') | |||
| parser.add_argument('--dataset_sink', action='store_true', help='enable dataset sink or not') | |||
| args = parser.parse_args() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
device_id = int(os.getenv('DEVICE_ID', '0'))
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=device_id) | |||
| network = LeNet5(cfg.num_classes) | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| @@ -49,12 +52,7 @@ if __name__ == "__main__": | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) | |||
| {% endif %} | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()}) | |||
| print("============== Starting Testing ==============") | |||
| param_dict = load_checkpoint(args.checkpoint_path) | |||
| @@ -63,5 +61,5 @@ if __name__ == "__main__": | |||
| do_train = False | |||
| ds_eval = create_dataset(data_path=data_path, do_train=do_train, batch_size=cfg.batch_size, | |||
| target=args.device_target) | |||
| acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) | |||
| acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink) | |||
| print("============== {} ==============".format(acc)) | |||
| @@ -57,6 +57,9 @@ cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| @@ -61,6 +61,6 @@ cp *.sh ./eval | |||
| cp -r ../src ./eval | |||
| cd ./eval || exit | |||
| env > env.log | |||
| echo "start evaluation for device $DEVICE_ID" | |||
| echo "start evaluation" | |||
| python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & | |||
| cd .. | |||
| @@ -65,6 +65,9 @@ cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| @@ -18,21 +18,15 @@ network config setting, will be used in train.py | |||
| from easydict import EasyDict as edict | |||
| cfg = edict({ | |||
| {% if dataset=='MNIST' %} | |||
| 'num_classes': 10, | |||
| {% elif dataset=='Cifar10' %} | |||
| 'num_classes': 10, | |||
| {% elif dataset=='ImageNet' %} | |||
| 'num_classes': 1001, | |||
| {% endif %} | |||
| {% if dataset=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| 'lr': 0.01, | |||
| "momentum": 0.9, | |||
| {% elif optimizer=='SGD' %} | |||
| 'lr': 0.1, | |||
| {% else %} | |||
| 'lr': 0.001, | |||
| {% endif %} | |||
| {% if optimizer=='Momentum' %} | |||
| "momentum": 0.9, | |||
| {% endif %} | |||
| 'epoch_size': 1, | |||
| 'batch_size': 32, | |||
| 'buffer_size': 1000, | |||
| @@ -48,6 +48,7 @@ if __name__ == "__main__": | |||
| if args.device_target == "CPU": | |||
| args.dataset_sink = False | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| ckpt_save_dir = './' | |||
| if args.run_distribute: | |||
| if args.device_target == 'Ascend': | |||
| @@ -62,7 +63,6 @@ if __name__ == "__main__": | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| mirror_mean=True) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| data_path = args.dataset_path | |||
| do_train = True | |||
| @@ -79,10 +79,10 @@ if __name__ == "__main__": | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) | |||
| {% else %} | |||
| net_opt = nn.{{ optimizer }}(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% endif %} | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| @@ -0,0 +1,136 @@ | |||
| # ResNet50 Example | |||
| ## Description | |||
These are examples of training ResNet50 on the CIFAR-10 or ImageNet dataset with MindSpore.
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
- Download the dataset; the expected directory structure is as follows:
| {% if dataset=='Cifar10' %} | |||
| CIFAR-10 | |||
| ``` | |||
└─Data
    ├─test
    │    cifar-10-verify-bin
    │
    └─train
         cifar-10-batches-bin
| ``` | |||
| {% elif dataset=='ImageNet' %} | |||
| ImageNet | |||
| ``` | |||
└─Data
    ├─test
    │    validation_preprocess
    │
    └─train
         ilsvrc
| ``` | |||
| {% endif %} | |||
| ## Structure | |||
| ```shell | |||
| . | |||
└── resnet50
    ├── README.md
    ├── script
    │   ├── run_distribute_train.sh      # launch distributed training (8 devices)
    │   ├── run_eval.sh                  # launch evaluation
    │   ├── run_standalone_train.sh      # launch standalone training (1 device)
    │   ├── run_distribute_train_gpu.sh  # launch GPU distributed training (4 devices)
    │   ├── run_eval_gpu.sh              # launch GPU evaluation
    │   └── run_standalone_train_gpu.sh  # launch GPU standalone training (1 device)
    ├── src
    │   ├── config.py                    # parameter configuration
    │   ├── crossentropy.py              # loss definition for the ImageNet2012 dataset
    │   ├── dataset.py                   # data preprocessing
    │   ├── lr_generator.py              # generate learning rate for each step
    │   └── resnet50.py                  # ResNet50 network definition
    ├── eval.py                          # eval net
    └── train.py                         # train net
| ``` | |||
| ## Parameter configuration | |||
| Parameters for both training and evaluation can be set in src/config.py. | |||
| ## Running the example | |||
| ### Train | |||
| #### Usage | |||
| ``` | |||
| # distributed training | |||
| Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training | |||
| Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
# distributed training example
| ./run_distribute_train.sh rank_table.json ~/dataset_path | |||
| # standalone training example | |||
| ./run_standalone_train.sh ~/dataset_path | |||
| ``` | |||
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
| #### Result | |||
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find checkpoint files together with log output like the following.
| ``` | |||
| epoch: 1 step: 1, loss is 2.3041954 | |||
| epoch: 1 step: 2, loss is 2.3079312 | |||
| ... | |||
| epoch: 1 step: 601, loss is 2.314184 | |||
| epoch: 1 step: 603, loss is 2.305666 | |||
| ... | |||
| ``` | |||
| ### Evaluation | |||
| #### Usage | |||
| ``` | |||
| # evaluation | |||
| Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
| # evaluation example | |||
./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/resnet50-1.591.ckpt
| ``` | |||
> The checkpoint file is produced during the training process.
| ### Running on GPU | |||
| ``` | |||
| # distributed training example | |||
| ./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training example | |||
| ./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # infer example | |||
| ./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
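The `lr_generator.py` listed in the structure above is not included in this diff; purely as an illustration of the per-step schedule it implies, a list of learning rates driven by the `lr_init`/`lr_end`/`lr_max` knobs from the config hunk below might be precomputed like this (a hypothetical sketch, not the actual implementation):

```python
import numpy as np

def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """Hypothetical per-step schedule: linear warmup to lr_max, then linear decay to lr_end."""
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
        lr_each_step.append(lr)
    return np.array(lr_each_step, dtype=np.float32)

lr = get_lr(0.01, 0.00001, 0.1, warmup_epochs=1, total_epochs=90, steps_per_epoch=1875)
print(lr[0], lr.max(), lr[-1])  # warmup start, peak, final decayed value
```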
| @@ -71,6 +71,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| cifar_ds = cifar_ds.repeat(repeat_size) | |||
| return cifar_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create a train or eval imagenet dataset | |||
| @@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| return ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -1,98 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| Produce the dataset | |||
| """ | |||
| import os | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as CV | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore.dataset.transforms.vision import Inter | |||
| from mindspore.common import dtype as mstype | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target='Ascend'): | |||
| """ | |||
| create dataset for train or test | |||
| """ | |||
| if do_train: | |||
| data_path = os.path.join(data_path, "train") | |||
| else: | |||
| data_path = os.path.join(data_path, "test") | |||
| if target == 'Ascend': | |||
| device_num, rank_id = _get_rank_info() | |||
| elif target == 'GPU': | |||
| init("nccl") | |||
| rank_id = get_rank() | |||
| device_num = get_group_size() | |||
| else: | |||
| device_num = 1 | |||
| # define dataset | |||
| if device_num == 1: | |||
| mnist_ds = ds.MnistDataset(data_path) | |||
| else: | |||
| mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| resize_height, resize_width = cfg.image_height, cfg.image_width | |||
| rescale = 1.0 / 255.0 | |||
| shift = 0.0 | |||
| rescale_nml = 1 / 0.3081 | |||
| shift_nml = -1 * 0.1307 / 0.3081 | |||
| # define map operations | |||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||
| rescale_op = CV.Rescale(rescale, shift) | |||
| hwc2chw_op = CV.HWC2CHW() | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| # apply map operations on images | |||
| mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op) | |||
| # apply DatasetOps | |||
| buffer_size = 10000 | |||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) | |||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||
| return mnist_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| """ | |||
| rank_size = int(os.environ.get("RANK_SIZE", 1)) | |||
| if rank_size > 1: | |||
| rank_size = get_group_size() | |||
| rank_id = get_rank() | |||
| else: | |||
| rank_size = 1 | |||
| rank_id = 0 | |||
| return rank_size, rank_id | |||
| @@ -16,7 +16,7 @@ | |||
| if [ $# != 2 ] && [ $# != 3 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| @@ -31,15 +31,15 @@ get_real_path(){ | |||
| PATH1=$(get_real_path $1) | |||
| PATH2=$(get_real_path $2) | |||
| if [ ! -d $PATH1 ] | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| exit 1 | |||
| fi | |||
| if [ ! -f $PATH2 ] | |||
| if [ ! -d $PATH2 ] | |||
| then | |||
| echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file" | |||
| echo "error: DATASET_PATH=$PATH2 is not a directory" | |||
| exit 1 | |||
| fi | |||
| @@ -56,16 +56,15 @@ fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH2 | |||
| export RANK_TABLE_FILE=$PATH2 | |||
| export SERVER_ID=0 | |||
| rank_start=$((DEVICE_NUM * SERVER_ID)) | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| for((i=0; i<DEVICE_NUM; i++)) | |||
| start_id=0 | |||
| for((i=start_id; i<DEVICE_NUM + start_id; i++)) | |||
| do | |||
| export DEVICE_ID=$i | |||
| export RANK_ID=$((rank_start + i)) | |||
| export RANK_ID=$((i - start_id)) | |||
| rm -rf ./train_parallel$i | |||
| mkdir ./train_parallel$i | |||
| cp ../*.py ./train_parallel$i | |||
| @@ -76,12 +75,12 @@ do | |||
| env > env.log | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log & | |||
| fi | |||
| if [ $# == 3 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log & | |||
| fi | |||
| cd .. | |||
| @@ -1,53 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=4 | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| @@ -0,0 +1,76 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
| fi | |||
| @@ -61,6 +61,6 @@ cp *.sh ./eval | |||
| cp -r ../src ./eval | |||
| cd ./eval || exit | |||
| env > env.log | |||
| echo "start evaluation for device $DEVICE_ID" | |||
| echo "start evaluation" | |||
| python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & | |||
| cd .. | |||
| @@ -1,59 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| cd .. | |||
| @@ -0,0 +1,77 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| cd .. | |||
| @@ -28,7 +28,13 @@ cfg = ed({ | |||
| "batch_size": 32, | |||
| "loss_scale": 1024, | |||
| {% if optimizer=='Momentum' %} | |||
| "lr": 0.01, | |||
| "momentum": 0.9, | |||
| "lr": 0.01, | |||
{% elif optimizer=='SGD' %}
"lr": 0.1,
{% else %}
"lr": 0.001,
{% endif %}
| "image_height": 224, | |||
| "image_width": 224, | |||
| @@ -48,7 +54,6 @@ cfg = ed({ | |||
| {% endif %} | |||
| "use_label_smooth": True, | |||
| "label_smooth_factor": 0.1, | |||
| "lr": 0.01, | |||
| "lr_init": 0.01, | |||
| "lr_end": 0.00001, | |||
| "lr_max": 0.1 | |||
| @@ -112,12 +112,11 @@ if __name__ == '__main__': | |||
| lr = Tensor(lr) | |||
| # define opt | |||
| {% if optimizer=='Lamb' %} | |||
| opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, | |||
| weight_decay=cfg.weight_decay) | |||
| {% elif optimizer=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum, | |||
| weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale) | |||
| {% else %} | |||
| opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=cfg.lr) | |||
| {% endif %} | |||
| # define loss, model | |||
| @@ -125,7 +124,7 @@ if __name__ == '__main__': | |||
| {% if dataset=='ImageNet' %} | |||
| if not cfg.use_label_smooth: | |||
| cfg.label_smooth_factor = 0.0 | |||
loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
| {% else %} | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||
| @@ -143,10 +142,10 @@ if __name__ == '__main__': | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum) | |||
| {% else %} | |||
| opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr) | |||
| {% endif %} | |||
| model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) | |||
| @@ -15,4 +15,4 @@ class Network(GenericNetwork): | |||
| name = 'alexnet' | |||
| supported_datasets = ['Cifar10', 'ImageNet'] | |||
| supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand'] | |||
| supported_optimizers = ['Momentum', 'Lamb'] | |||
| supported_optimizers = ['Momentum', 'Adam', 'SGD'] | |||
| @@ -21,4 +21,4 @@ class Network(GenericNetwork): | |||
| name = 'lenet' | |||
| supported_datasets = ['MNIST'] | |||
| supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand'] | |||
| supported_optimizers = ['Momentum', 'Lamb'] | |||
| supported_optimizers = ['Momentum', 'Adam', 'SGD'] | |||
| @@ -6,7 +6,7 @@ | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """alexnet module.""" | |||
| """Resnet50 module.""" | |||
| from mindinsight.wizard.network.generic_network import GenericNetwork | |||
| @@ -15,4 +15,4 @@ class Network(GenericNetwork): | |||
| name = 'resnet50' | |||
| supported_datasets = ['Cifar10', 'ImageNet'] | |||
| supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand'] | |||
| supported_optimizers = ['Momentum', 'Lamb'] | |||
| supported_optimizers = ['Momentum', 'Adam', 'SGD'] | |||
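Each wizard network registers itself by subclassing `GenericNetwork` and declaring its supported options, as the three modules above show; a hypothetical new entry would follow the same shape (all names below are invented for illustration):

```python
"""Hypothetical example module, mirroring the pattern of the modules above."""
from mindinsight.wizard.network.generic_network import GenericNetwork


class Network(GenericNetwork):
    """Declare what the wizard may offer for this architecture."""
    name = 'mynet'  # hypothetical network name
    supported_datasets = ['Cifar10']
    supported_loss_functions = ['SoftmaxCrossEntropyWithLogits']
    supported_optimizers = ['Momentum', 'Adam', 'SGD']
```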