| @@ -29,7 +29,7 @@ def render_template(template_file_path, context): | |||
| class TemplateManager: | |||
| """BaseNetwork code generator.""" | |||
| replace_template_suffixes = [('.py-tpl', '.py')] | |||
| replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')] | |||
| def __init__(self, template_base_dir, exclude_dirs=None, exclude_files=None): | |||
| self.template_base_dir = template_base_dir | |||
| @@ -70,7 +70,7 @@ class TemplateManager: | |||
| """Generate the network files.""" | |||
| source_files = [] | |||
| template_files = self.get_template_files() | |||
| extensions = tuple(options.get('extensions', '.py')) | |||
| extensions = tuple([new_extension for _, new_extension in self.replace_template_suffixes]) | |||
| for template_file in template_files: | |||
| new_file_path = template_file | |||
| template_file_path = template_file | |||
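Deriving the accepted extensions from `replace_template_suffixes` keeps the two lists from drifting apart; note the old `tuple(options.get('extensions', '.py'))` would even split the default string into `('.', 'p', 'y')`. A minimal standalone sketch of the idea (`target_path` is an illustrative helper, not part of the wizard code):

```python
# Sketch: derive target extensions from the suffix map and rename template files.
replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')]

def target_path(template_file):
    """Map 'train.py-tpl' -> 'train.py'; leave non-template files untouched."""
    for old_suffix, new_suffix in replace_template_suffixes:
        if template_file.endswith(old_suffix):
            return template_file[:-len(old_suffix)] + new_suffix
    return template_file

extensions = tuple(new_ext for _, new_ext in replace_template_suffixes)
print(target_path('src/train.py-tpl'))  # src/train.py
print(extensions)                       # ('.py', '.sh', '.md')
```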
| @@ -0,0 +1,135 @@ | |||
| # AlexNet Example | |||
| ## Description | |||
These are examples of training AlexNet on the CIFAR-10 or ImageNet dataset with MindSpore.
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
- Download the dataset; the expected directory structure is as follows:
| {% if dataset=='Cifar10' %} | |||
| CIFAR-10 | |||
| ``` | |||
└─Data
    ├─test
    │    cifar-10-verify-bin
    │
    └─train
         cifar-10-batches-bin
| ``` | |||
| {% elif dataset=='ImageNet' %} | |||
| ImageNet | |||
| ``` | |||
└─Data
    ├─test
    │    validation_preprocess
    │
    └─train
         ilsvrc
| ``` | |||
| {% endif %} | |||
| ## Structure | |||
| ```shell | |||
| . | |||
└── alexnet
    ├── README.md
    ├── script
    │   ├── run_distribute_train.sh      # launch distributed training (8 devices)
    │   ├── run_eval.sh                  # launch evaluation
    │   ├── run_standalone_train.sh      # launch standalone training (1 device)
    │   ├── run_distribute_train_gpu.sh  # launch GPU distributed training (4 devices)
    │   ├── run_eval_gpu.sh              # launch GPU evaluation
    │   └── run_standalone_train_gpu.sh  # launch GPU standalone training (1 device)
    ├── src
    │   ├── config.py                    # parameter configuration
    │   ├── dataset.py                   # data preprocessing
    │   ├── generator_lr.py              # generate learning rate for each step
    │   └── alexnet.py                   # AlexNet network definition
    ├── eval.py                          # eval net
    └── train.py                         # train net
| ``` | |||
| ## Parameter configuration | |||
| Parameters for both training and evaluation can be set in src/config.py. | |||
| ## Running the example | |||
| ### Train | |||
| #### Usage | |||
| ``` | |||
| # distributed training | |||
| Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training | |||
| Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
# distributed training example
| ./run_distribute_train.sh rank_table.json ~/dataset_path | |||
| # standalone training example | |||
| ./run_standalone_train.sh ~/dataset_path | |||
| ``` | |||
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
| #### Result | |||
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find checkpoint files together with log output like the following.
| ``` | |||
| epoch: 1 step: 1, loss is 2.3041954 | |||
| epoch: 1 step: 2, loss is 2.3079312 | |||
| ... | |||
| epoch: 1 step: 601, loss is 2.314184 | |||
| epoch: 1 step: 603, loss is 2.305666 | |||
| ... | |||
| ``` | |||
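The per-step loss lines above have a fixed shape, so they are easy to post-process; a small sketch (a hypothetical helper, not part of this example) that extracts the loss values from such a log:

```python
import re

# Parse "epoch: 1 step: 601, loss is 2.314184" lines from a training log.
pattern = re.compile(r"epoch: (\d+) step: (\d+), loss is ([\d.]+)")
with open("log") as log_file:  # the file the launch scripts redirect output into
    losses = [float(m.group(3)) for m in map(pattern.search, log_file) if m]
print("steps parsed:", len(losses), "last loss:", losses[-1] if losses else None)
```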
| ### Evaluation | |||
| #### Usage | |||
| ``` | |||
| # evaluation | |||
| Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
| # evaluation example | |||
| ./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt | |||
| ``` | |||
> The checkpoint file is produced during the training process.
| ### Running on GPU | |||
| ``` | |||
| # distributed training example | |||
| ./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training example | |||
| ./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # infer example | |||
| ./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| @@ -24,6 +24,7 @@ from mindspore.common import dtype as mstype | |||
| from .config import cfg | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create dataset for train or test | |||
| @@ -66,6 +67,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| cifar_ds = cifar_ds.repeat(repeat_size) | |||
| return cifar_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create a train or eval imagenet dataset | |||
| @@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| return ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -1,98 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| Produce the dataset | |||
| """ | |||
| import os | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as CV | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore.dataset.transforms.vision import Inter | |||
| from mindspore.common import dtype as mstype | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create dataset for train or test | |||
| """ | |||
| if do_train: | |||
| data_path = os.path.join(data_path, "train") | |||
| else: | |||
| data_path = os.path.join(data_path, "test") | |||
| if target == 'Ascend': | |||
| device_num, rank_id = _get_rank_info() | |||
| elif target == 'GPU': | |||
| init("nccl") | |||
| rank_id = get_rank() | |||
| device_num = get_group_size() | |||
| else: | |||
| device_num = 1 | |||
| # define dataset | |||
| if device_num == 1: | |||
| mnist_ds = ds.MnistDataset(data_path) | |||
| else: | |||
| mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| resize_height, resize_width = cfg.image_height, cfg.image_width | |||
| rescale = 1.0 / 255.0 | |||
| shift = 0.0 | |||
| rescale_nml = 1 / 0.3081 | |||
| shift_nml = -1 * 0.1307 / 0.3081 | |||
| # define map operations | |||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||
| rescale_op = CV.Rescale(rescale, shift) | |||
| hwc2chw_op = CV.HWC2CHW() | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| # apply map operations on images | |||
| mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op) | |||
| # apply DatasetOps | |||
| buffer_size = 10000 | |||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) | |||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||
| return mnist_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| """ | |||
| rank_size = int(os.environ.get("RANK_SIZE", 1)) | |||
| if rank_size > 1: | |||
| rank_size = get_group_size() | |||
| rank_id = get_rank() | |||
| else: | |||
| rank_size = 1 | |||
| rank_id = 0 | |||
| return rank_size, rank_id | |||
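The per-dataset templates that replace this deleted helper keep the same `create_dataset` signature, so call sites stay unchanged; a usage sketch of the function above (the path is a placeholder, and with `RANK_SIZE` unset the run is single-device, so no sharding happens):

```python
# Usage sketch for the create_dataset signature above (placeholder path).
train_ds = create_dataset("/path/to/MNIST", batch_size=32, repeat_size=1,
                          do_train=True, target="Ascend")
print(train_ds.get_dataset_size())  # batches per epoch after drop_remainder
```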
| @@ -18,6 +18,7 @@ eval alexnet according to model file: | |||
python eval.py --dataset_path /YourDataPath --checkpoint_path Your.ckpt
| """ | |||
| import os | |||
| import argparse | |||
| from src.config import cfg | |||
| from src.dataset import create_dataset | |||
| @@ -33,15 +34,16 @@ if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\ | |||
| parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved') | |||
parser.add_argument('--checkpoint_path', type=str, default="./ckpt", help='for evaluation, must provide\
the path of the trained checkpoint file')
parser.add_argument('--dataset_sink_mode', type=str, default='True', choices=['True', 'False'],
| help='DataSet sink mode is True or False') | |||
| args = parser.parse_args() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| data_path = args.data_path | |||
device_id = int(os.getenv('DEVICE_ID', '0'))
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=device_id) | |||
| data_path = args.dataset_path | |||
dataset_sink_mode = args.dataset_sink_mode == 'True'
| network = AlexNet(cfg.num_classes) | |||
| @@ -50,15 +52,10 @@ if __name__ == "__main__": | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) | |||
| {% endif %} | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()}) | |||
| print("============== Starting Testing ==============") | |||
| param_dict = load_checkpoint(args.ckpt_path) | |||
| param_dict = load_checkpoint(args.checkpoint_path) | |||
| load_param_into_net(network, param_dict) | |||
| do_train = False | |||
| ds_eval = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train, | |||
| @@ -16,7 +16,7 @@ | |||
| if [ $# != 2 ] && [ $# != 3 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| @@ -31,15 +31,15 @@ get_real_path(){ | |||
| PATH1=$(get_real_path $1) | |||
| PATH2=$(get_real_path $2) | |||
| if [ ! -d $PATH1 ] | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| exit 1 | |||
| fi | |||
| if [ ! -f $PATH2 ] | |||
| if [ ! -d $PATH2 ] | |||
| then | |||
| echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file" | |||
| echo "error: DATASET_PATH=$PATH2 is not a directory" | |||
| exit 1 | |||
| fi | |||
| @@ -56,15 +56,15 @@ fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH2 | |||
| export RANK_TABLE_FILE=$PATH2 | |||
| rank_start=$((DEVICE_NUM * SERVER_ID)) | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| for((i=0; i<DEVICE_NUM; i++)) | |||
| start_id=0 | |||
| for((i=start_id; i<DEVICE_NUM + start_id; i++)) | |||
| do | |||
| export DEVICE_ID=$i | |||
| export RANK_ID=$((rank_start + i)) | |||
| export RANK_ID=$((i - start_id)) | |||
| rm -rf ./train_parallel$i | |||
| mkdir ./train_parallel$i | |||
| cp ../*.py ./train_parallel$i | |||
| @@ -75,12 +75,12 @@ do | |||
| env > env.log | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log & | |||
| fi | |||
| if [ $# == 3 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log & | |||
| fi | |||
| cd .. | |||
| @@ -1,53 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=4 | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| @@ -0,0 +1,75 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| @@ -61,6 +61,6 @@ cp *.sh ./eval | |||
| cp -r ../src ./eval | |||
| cd ./eval || exit | |||
| env > env.log | |||
| echo "start evaluation for device $DEVICE_ID" | |||
| echo "start evaluation" | |||
| python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & | |||
| cd .. | |||
| @@ -1,59 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| cd .. | |||
| @@ -0,0 +1,77 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| cd .. | |||
| @@ -17,17 +17,20 @@ import mindspore.nn as nn | |||
| from mindspore.common.initializer import TruncatedNormal | |||
| from mindspore.ops import operations as P | |||
| def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"): | |||
| weight = weight_variable() | |||
| return nn.Conv2d(in_channels, out_channels, | |||
| kernel_size=kernel_size, stride=stride, padding=padding, | |||
| weight_init=weight, has_bias=False, pad_mode=pad_mode) | |||
| def fc_with_initialize(input_channels, out_channels): | |||
| weight = weight_variable() | |||
| bias = weight_variable() | |||
| return nn.Dense(input_channels, out_channels, weight, bias) | |||
| def weight_variable(): | |||
| return TruncatedNormal(0.02) | |||
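Both helpers share the `TruncatedNormal(0.02)` initializer above; as a usage sketch, the classic AlexNet front end would be assembled from them like this (shapes shown for illustration only, not taken from this diff):

```python
# Usage sketch of the helpers above (classic AlexNet shapes, for illustration).
conv1 = conv(3, 96, 11, stride=4)            # first conv block: 3 -> 96 channels, 11x11 kernel
fc1 = fc_with_initialize(6 * 6 * 256, 4096)  # flattened conv features -> first dense layer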
| @@ -26,15 +26,21 @@ cfg = edict({ | |||
| {% elif dataset=='ImageNet' %} | |||
| 'num_classes': 1001, | |||
| {% endif %} | |||
| 'lr': 0.002, | |||
| {% if optimizer=='Momentum' %} | |||
| 'lr': 0.002, | |||
| "momentum": 0.9, | |||
| {% elif optimizer=='SGD' %} | |||
| 'lr': 0.1, | |||
| {% else %} | |||
| 'lr': 0.001, | |||
| {% endif %} | |||
| 'epoch_size': 1, | |||
| 'batch_size': 32, | |||
| 'loss_scale': 1024, | |||
| 'buffer_size': 1000, | |||
| 'image_height': 227, | |||
| 'image_width': 227, | |||
| 'weight_decay': 1e-4, | |||
| 'save_checkpoint': True, | |||
| 'save_checkpoint_epochs': 5, | |||
| 'keep_checkpoint_max': 10, | |||
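Once rendered with `optimizer=='Momentum'` and `dataset=='Cifar10'`, the Jinja branches above collapse into a flat dict; an illustrative sketch of the rendered `src/config.py`:

```python
# Rendered src/config.py for optimizer == 'Momentum', dataset == 'Cifar10' (illustrative).
from easydict import EasyDict as edict

cfg = edict({
    'num_classes': 10,          # Cifar10 branch
    'lr': 0.002,
    'momentum': 0.9,            # only present for the Momentum optimizer
    'epoch_size': 1,
    'batch_size': 32,
    'loss_scale': 1024,
    'buffer_size': 1000,
    'image_height': 227,
    'image_width': 227,
    'weight_decay': 1e-4,
    'save_checkpoint': True,
    'save_checkpoint_epochs': 5,
    'keep_checkpoint_max': 10,
})
```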
| @@ -18,6 +18,7 @@ train alexnet and get network model files(.ckpt) : | |||
python train.py --dataset_path /YourDataPath
| """ | |||
| import os | |||
| import argparse | |||
| from src.config import cfg | |||
| from src.dataset import create_dataset | |||
| @@ -26,9 +27,10 @@ from src.alexnet import AlexNet | |||
| import mindspore.nn as nn | |||
| from mindspore import context | |||
| from mindspore import Tensor | |||
| from mindspore.train import Model | |||
| from mindspore.train import Model, ParallelMode | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore.train.loss_scale_manager import FixedLossScaleManager | |||
| from mindspore.parallel._auto_parallel_context import auto_parallel_context | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| @@ -41,7 +43,7 @@ if __name__ == "__main__": | |||
| parser.add_argument('--device_num', type=int, default=1, help='Device num') | |||
| parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||
| help='device where the code will be implemented (default: Ascend)') | |||
| parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved') | |||
| parser.add_argument('--pre_trained', type=str, default=None, help='Pre-trained checkpoint path') | |||
parser.add_argument('--dataset_sink_mode', type=str, default='True', choices=['True', 'False'],
| help='DataSet sink mode is True or False') | |||
| @@ -58,7 +60,6 @@ if __name__ == "__main__": | |||
| context.set_context(device_id=device_id, enable_auto_mixed_precision=True) | |||
| context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| mirror_mean=True) | |||
| auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160]) | |||
| init() | |||
| # GPU target | |||
| @@ -69,7 +70,7 @@ if __name__ == "__main__": | |||
| ckpt_save_dir = cfg.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" | |||
| data_path = args.data_path | |||
| data_path = args.dataset_path | |||
| do_train = True | |||
| ds_train = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train, | |||
| @@ -77,14 +78,14 @@ if __name__ == "__main__": | |||
| step_size = ds_train.get_dataset_size() | |||
| # define net | |||
| network = AlexNet(cfg.num_classes) | |||
| net = AlexNet(cfg.num_classes) | |||
| # init weight | |||
| if args.pre_trained: | |||
| param_dict = load_checkpoint(args.pre_trained) | |||
| load_param_into_net(network, param_dict) | |||
| load_param_into_net(net, param_dict) | |||
| else: | |||
| for _, cell in network.cells_and_names(): | |||
| for _, cell in net.cells_and_names(): | |||
| if isinstance(cell, nn.Conv2d): | |||
| cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), | |||
| cell.weight.default_input.shape, | |||
| @@ -93,20 +94,37 @@ if __name__ == "__main__": | |||
| cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), | |||
| cell.weight.default_input.shape, | |||
| cell.weight.default_input.dtype).to_tensor() | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| # define learning rate | |||
| lr = Tensor(get_lr(0, cfg.lr, cfg.epoch_size, ds_train.get_dataset_size())) | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=lr, momentum=cfg.momentum) | |||
| {% endif %} | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| # define loss, model | |||
| if target == "Ascend": | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum, | |||
| weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale) | |||
| {% else %} | |||
| opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=cfg.lr) | |||
| {% endif %} | |||
| loss_scale = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False) | |||
| model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, | |||
| amp_level="O2", keep_batchnorm_fp32=False) | |||
| else: | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(net.trainable_params(), learning_rate=lr, momentum=cfg.momentum) | |||
| {% else %} | |||
| opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=lr) | |||
| {% endif %} | |||
| model = Model(net, loss, opt, metrics={"Accuracy": Accuracy()}) | |||
| # define callbacks | |||
| time_cb = TimeMonitor(data_size=step_size) | |||
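On Ascend the model trains in mixed precision (`amp_level="O2"`), which is why the fixed scale `cfg.loss_scale` appears both in `FixedLossScaleManager` and on `nn.Momentum` (the optimizer divides the scaled update back out). A small NumPy illustration of the underflow problem this solves:

```python
import numpy as np

# fp16 flushes values below ~6e-8 to zero; scaling the loss keeps tiny gradients alive.
tiny_grad = 1e-8
print(np.float16(tiny_grad))         # 0.0 -- underflows in fp16
print(np.float16(tiny_grad * 1024))  # ~1.02e-05 -- survives after scaling by cfg.loss_scale
# The optimizer later divides the update by the same factor in higher precision:
print(np.float32(np.float16(tiny_grad * 1024)) / 1024)  # ~1e-08 recovered
```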
| @@ -114,7 +132,7 @@ if __name__ == "__main__": | |||
| cb = [time_cb, loss_cb] | |||
| if cfg.save_checkpoint: | |||
| cfg_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * step_size, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| ckpt_cb = ModelCheckpoint(prefix="alexnet", directory=ckpt_save_dir, config=cfg_ck) | |||
| cb += [ckpt_cb] | |||
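With `save_checkpoint_steps=cfg.save_checkpoint_epochs * step_size`, a checkpoint lands every five epochs under the config above. The training loop itself lies outside this hunk; it typically ends with a call like the following sketch, using the names defined above:

```python
# Sketch: the usual tail of train.py, driving the callbacks assembled above.
model.train(cfg.epoch_size, ds_train, callbacks=cb,
            dataset_sink_mode=(args.dataset_sink_mode == 'True'))
```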
| @@ -0,0 +1,120 @@ | |||
| # LeNet Example | |||
| ## Description | |||
These are examples of training LeNet on the MNIST dataset with MindSpore.
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
- Download the dataset; the expected directory structure is as follows:
| ``` | |||
└─Data
    ├─test
    │    t10k-images.idx3-ubyte
    │    t10k-labels.idx1-ubyte
    │
    └─train
         train-images.idx3-ubyte
         train-labels.idx1-ubyte
| ``` | |||
| ## Structure | |||
| ```shell | |||
| . | |||
└── lenet
    ├── README.md
    ├── script
    │   ├── run_distribute_train.sh      # launch distributed training (8 devices)
    │   ├── run_eval.sh                  # launch evaluation
    │   ├── run_standalone_train.sh      # launch standalone training (1 device)
    │   ├── run_distribute_train_gpu.sh  # launch GPU distributed training (8 devices)
    │   ├── run_eval_gpu.sh              # launch GPU evaluation
    │   └── run_standalone_train_gpu.sh  # launch GPU standalone training (1 device)
    ├── src
    │   ├── config.py                    # parameter configuration
    │   ├── dataset.py                   # data preprocessing
    │   └── lenet.py                     # LeNet network definition
    ├── eval.py                          # eval net
    └── train.py                         # train net
| ``` | |||
| ## Parameter configuration | |||
| Parameters for both training and evaluation can be set in src/config.py. | |||
| ## Running the example | |||
| ### Train | |||
| #### Usage | |||
| ``` | |||
| # distributed training | |||
| Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training | |||
| Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
# distributed training example
| ./run_distribute_train.sh rank_table.json ~/MNIST_data | |||
| # standalone training example | |||
| ./run_standalone_train.sh ~/MNIST_data | |||
| ``` | |||
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
| #### Result | |||
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find checkpoint files together with log output like the following.
| ``` | |||
| epoch: 1 step: 1, loss is 2.3041954 | |||
| epoch: 1 step: 2, loss is 2.3079312 | |||
| ... | |||
| epoch: 1 step: 601, loss is 2.314184 | |||
| epoch: 1 step: 603, loss is 2.305666 | |||
| ... | |||
| ``` | |||
| ### Evaluation | |||
| #### Usage | |||
| ``` | |||
| # evaluation | |||
| Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
| # evaluation example | |||
| ./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt | |||
| ``` | |||
> The checkpoint file is produced during the training process.
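The checkpoint written during training is exactly what `eval.py` loads back; a minimal sketch using the serialization helpers already imported there (the path is a placeholder, and `network` is the LeNet5 instance built in `eval.py`):

```python
from mindspore.train.serialization import load_checkpoint, load_param_into_net

# Load a training checkpoint back into a freshly built network (placeholder path).
param_dict = load_checkpoint("checkpoint_lenet-2_937.ckpt")
load_param_into_net(network, param_dict)
```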
| ### Running on GPU | |||
| ``` | |||
| # distributed training example | |||
| ./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training example | |||
| ./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # infer example | |||
| ./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| @@ -17,6 +17,8 @@ | |||
| eval lenet according to model file: | |||
python eval.py --data_path /YourDataPath --checkpoint_path Your.ckpt
| """ | |||
| import os | |||
| import argparse | |||
| import mindspore.nn as nn | |||
| @@ -37,11 +39,12 @@ if __name__ == "__main__": | |||
| help='path where the dataset is saved') | |||
parser.add_argument('--checkpoint_path', type=str, default="", help='for evaluation, must provide\
the path of the trained checkpoint file')
| parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True') | |||
| parser.add_argument('--dataset_sink', action='store_true', help='enable dataset sink or not') | |||
| args = parser.parse_args() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
device_id = int(os.getenv('DEVICE_ID', '0'))
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=device_id) | |||
| network = LeNet5(cfg.num_classes) | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| @@ -49,12 +52,7 @@ if __name__ == "__main__": | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) | |||
| {% endif %} | |||
| model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||
| model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()}) | |||
| print("============== Starting Testing ==============") | |||
| param_dict = load_checkpoint(args.checkpoint_path) | |||
| @@ -63,5 +61,5 @@ if __name__ == "__main__": | |||
| do_train = False | |||
| ds_eval = create_dataset(data_path=data_path, do_train=do_train, batch_size=cfg.batch_size, | |||
| target=args.device_target) | |||
| acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode) | |||
| acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink) | |||
| print("============== {} ==============".format(acc)) | |||
| @@ -57,6 +57,9 @@ cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| @@ -61,6 +61,6 @@ cp *.sh ./eval | |||
| cp -r ../src ./eval | |||
| cd ./eval || exit | |||
| env > env.log | |||
| echo "start evaluation for device $DEVICE_ID" | |||
| echo "start evaluation" | |||
| python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & | |||
| cd .. | |||
| @@ -65,6 +65,9 @@ cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| @@ -18,21 +18,15 @@ network config setting, will be used in train.py | |||
| from easydict import EasyDict as edict | |||
| cfg = edict({ | |||
| {% if dataset=='MNIST' %} | |||
| 'num_classes': 10, | |||
| {% elif dataset=='Cifar10' %} | |||
| 'num_classes': 10, | |||
| {% elif dataset=='ImageNet' %} | |||
| 'num_classes': 1001, | |||
| {% endif %} | |||
| {% if dataset=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| 'lr': 0.01, | |||
| "momentum": 0.9, | |||
| {% elif optimizer=='SGD' %} | |||
| 'lr': 0.1, | |||
| {% else %} | |||
| 'lr': 0.001, | |||
| {% endif %} | |||
| {% if optimizer=='Momentum' %} | |||
| "momentum": 0.9, | |||
| {% endif %} | |||
| 'epoch_size': 1, | |||
| 'batch_size': 32, | |||
| 'buffer_size': 1000, | |||
| @@ -48,6 +48,7 @@ if __name__ == "__main__": | |||
| if args.device_target == "CPU": | |||
| args.dataset_sink = False | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| ckpt_save_dir = './' | |||
| if args.run_distribute: | |||
| if args.device_target == 'Ascend': | |||
| @@ -62,7 +63,6 @@ if __name__ == "__main__": | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| mirror_mean=True) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| data_path = args.dataset_path | |||
| do_train = True | |||
| @@ -79,10 +79,10 @@ if __name__ == "__main__": | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) | |||
| {% else %} | |||
| net_opt = nn.{{ optimizer }}(network.trainable_params(), learning_rate=cfg.lr) | |||
| {% endif %} | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
| config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| @@ -0,0 +1,136 @@ | |||
| # ResNet50 Example | |||
| ## Description | |||
These are examples of training ResNet50 on the CIFAR-10 or ImageNet dataset with MindSpore.
| ## Requirements | |||
| - Install [MindSpore](https://www.mindspore.cn/install/en). | |||
- Download the dataset; the expected directory structure is as follows:
| {% if dataset=='Cifar10' %} | |||
| CIFAR-10 | |||
| ``` | |||
└─Data
    ├─test
    │    cifar-10-verify-bin
    │
    └─train
         cifar-10-batches-bin
| ``` | |||
| {% elif dataset=='ImageNet' %} | |||
| ImageNet | |||
| ``` | |||
└─Data
    ├─test
    │    validation_preprocess
    │
    └─train
         ilsvrc
| ``` | |||
| {% endif %} | |||
| ## Structure | |||
| ```shell | |||
| . | |||
└── resnet50
    ├── README.md
    ├── script
    │   ├── run_distribute_train.sh      # launch distributed training (8 devices)
    │   ├── run_eval.sh                  # launch evaluation
    │   ├── run_standalone_train.sh      # launch standalone training (1 device)
    │   ├── run_distribute_train_gpu.sh  # launch GPU distributed training (4 devices)
    │   ├── run_eval_gpu.sh              # launch GPU evaluation
    │   └── run_standalone_train_gpu.sh  # launch GPU standalone training (1 device)
    ├── src
    │   ├── config.py                    # parameter configuration
    │   ├── crossentropy.py              # loss definition for the ImageNet2012 dataset
    │   ├── dataset.py                   # data preprocessing
    │   ├── lr_generator.py              # generate learning rate for each step
    │   └── resnet50.py                  # ResNet50 network definition
    ├── eval.py                          # eval net
    └── train.py                         # train net
| ``` | |||
| ## Parameter configuration | |||
| Parameters for both training and evaluation can be set in src/config.py. | |||
| ## Running the example | |||
| ### Train | |||
| #### Usage | |||
| ``` | |||
| # distributed training | |||
| Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training | |||
| Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
# distributed training example
| ./run_distribute_train.sh rank_table.json ~/dataset_path | |||
| # standalone training example | |||
| ./run_standalone_train.sh ~/dataset_path | |||
| ``` | |||
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
| #### Result | |||
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find checkpoint files together with log output like the following.
| ``` | |||
| epoch: 1 step: 1, loss is 2.3041954 | |||
| epoch: 1 step: 2, loss is 2.3079312 | |||
| ... | |||
| epoch: 1 step: 601, loss is 2.314184 | |||
| epoch: 1 step: 603, loss is 2.305666 | |||
| ... | |||
| ``` | |||
| ### Evaluation | |||
| #### Usage | |||
| ``` | |||
| # evaluation | |||
| Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
| #### Launch | |||
| ``` | |||
| # evaluation example | |||
./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/resnet50-1.591.ckpt
| ``` | |||
> The checkpoint file is produced during the training process.
| ### Running on GPU | |||
| ``` | |||
| # distributed training example | |||
| ./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # standalone training example | |||
| ./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) | |||
| # infer example | |||
| ./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] | |||
| ``` | |||
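The `lr_generator.py` listed in the structure above is not included in this diff; purely as an illustration of the per-step schedule it implies, a list of learning rates driven by the `lr_init`/`lr_end`/`lr_max` knobs from the config hunk below might be precomputed like this (a hypothetical sketch, not the actual implementation):

```python
import numpy as np

def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """Hypothetical per-step schedule: linear warmup to lr_max, then linear decay to lr_end."""
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
        lr_each_step.append(lr)
    return np.array(lr_each_step, dtype=np.float32)

lr = get_lr(0.01, 0.00001, 0.1, warmup_epochs=1, total_epochs=90, steps_per_epoch=1875)
print(lr[0], lr.max(), lr[-1])  # warmup start, peak, final decayed value
```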
| @@ -71,6 +71,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| cifar_ds = cifar_ds.repeat(repeat_size) | |||
| return cifar_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): | |||
| """ | |||
| create a train or eval imagenet dataset | |||
| @@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe | |||
| return ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| @@ -1,98 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| Produce the dataset | |||
| """ | |||
| import os | |||
| import mindspore.dataset as ds | |||
| import mindspore.dataset.transforms.vision.c_transforms as CV | |||
| import mindspore.dataset.transforms.c_transforms as C | |||
| from mindspore.dataset.transforms.vision import Inter | |||
| from mindspore.common import dtype as mstype | |||
| from mindspore.communication.management import init, get_rank, get_group_size | |||
| from .config import cfg | |||
| def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target='Ascend'): | |||
| """ | |||
| create dataset for train or test | |||
| """ | |||
| if do_train: | |||
| data_path = os.path.join(data_path, "train") | |||
| else: | |||
| data_path = os.path.join(data_path, "test") | |||
| if target == 'Ascend': | |||
| device_num, rank_id = _get_rank_info() | |||
| elif target == 'GPU': | |||
| init("nccl") | |||
| rank_id = get_rank() | |||
| device_num = get_group_size() | |||
| else: | |||
| device_num = 1 | |||
| # define dataset | |||
| if device_num == 1: | |||
| mnist_ds = ds.MnistDataset(data_path) | |||
| else: | |||
| mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True, | |||
| num_shards=device_num, shard_id=rank_id) | |||
| resize_height, resize_width = cfg.image_height, cfg.image_width | |||
| rescale = 1.0 / 255.0 | |||
| shift = 0.0 | |||
| rescale_nml = 1 / 0.3081 | |||
| shift_nml = -1 * 0.1307 / 0.3081 | |||
| # define map operations | |||
| resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||
| rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||
| rescale_op = CV.Rescale(rescale, shift) | |||
| hwc2chw_op = CV.HWC2CHW() | |||
| type_cast_op = C.TypeCast(mstype.int32) | |||
| # apply map operations on images | |||
| mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op) | |||
| mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op) | |||
| # apply DatasetOps | |||
| buffer_size = 10000 | |||
| mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) | |||
| mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||
| mnist_ds = mnist_ds.repeat(repeat_size) | |||
| return mnist_ds | |||
| def _get_rank_info(): | |||
| """ | |||
| get rank size and rank id | |||
| """ | |||
| rank_size = int(os.environ.get("RANK_SIZE", 1)) | |||
| if rank_size > 1: | |||
| rank_size = get_group_size() | |||
| rank_id = get_rank() | |||
| else: | |||
| rank_size = 1 | |||
| rank_id = 0 | |||
| return rank_size, rank_id | |||
| @@ -16,7 +16,7 @@ | |||
| if [ $# != 2 ] && [ $# != 3 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| @@ -31,15 +31,15 @@ get_real_path(){ | |||
| PATH1=$(get_real_path $1) | |||
| PATH2=$(get_real_path $2) | |||
| if [ ! -d $PATH1 ] | |||
| if [ ! -f $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| echo "error: RANK_TABLE_FILE=$PATH1 is not a file" | |||
| exit 1 | |||
| fi | |||
| if [ ! -f $PATH2 ] | |||
| if [ ! -d $PATH2 ] | |||
| then | |||
| echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file" | |||
| echo "error: DATASET_PATH=$PATH2 is not a directory" | |||
| exit 1 | |||
| fi | |||
| @@ -56,16 +56,15 @@ fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH2 | |||
| export RANK_TABLE_FILE=$PATH2 | |||
| export SERVER_ID=0 | |||
| rank_start=$((DEVICE_NUM * SERVER_ID)) | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 | |||
| export RANK_TABLE_FILE=$PATH1 | |||
| for((i=0; i<DEVICE_NUM; i++)) | |||
| start_id=0 | |||
| for((i=start_id; i<DEVICE_NUM + start_id; i++)) | |||
| do | |||
| export DEVICE_ID=$i | |||
| export RANK_ID=$((rank_start + i)) | |||
| export RANK_ID=$((i - start_id)) | |||
| rm -rf ./train_parallel$i | |||
| mkdir ./train_parallel$i | |||
| cp ../*.py ./train_parallel$i | |||
| @@ -76,12 +75,12 @@ do | |||
| env > env.log | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log & | |||
| fi | |||
| if [ $# == 3 ] | |||
| then | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log & | |||
| python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log & | |||
| fi | |||
| cd .. | |||
| @@ -1,53 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=4 | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| @@ -0,0 +1,76 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=4 | |||
| export RANK_SIZE=$DEVICE_NUM | |||
| rm -rf ./train_parallel | |||
| mkdir ./train_parallel | |||
| cp ../*.py ./train_parallel | |||
| cp *.sh ./train_parallel | |||
| cp -r ../src ./train_parallel | |||
| cd ./train_parallel || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
| --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| mpirun --allow-run-as-root -n $RANK_SIZE \ | |||
| python train.py --run_distribute=True \ | |||
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
| fi | |||
| @@ -61,6 +61,6 @@ cp *.sh ./eval | |||
| cp -r ../src ./eval | |||
| cd ./eval || exit | |||
| env > env.log | |||
| echo "start evaluation for device $DEVICE_ID" | |||
| echo "start evaluation" | |||
| python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & | |||
| cd .. | |||
| @@ -1,59 +0,0 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| cd .. | |||
| @@ -0,0 +1,77 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| if [ $# != 1 ] && [ $# != 2 ] | |||
| then | |||
| echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" | |||
| exit 1 | |||
| fi | |||
| get_real_path(){ | |||
| if [ "${1:0:1}" == "/" ]; then | |||
| echo "$1" | |||
| else | |||
| echo "$(realpath -m $PWD/$1)" | |||
| fi | |||
| } | |||
| PATH1=$(get_real_path $1) | |||
| if [ ! -d $PATH1 ] | |||
| then | |||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||
| exit 1 | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| PATH2=$(get_real_path $2) | |||
| fi | |||
| if [ $# == 2 ] && [ ! -f $PATH2 ] | |||
| then | |||
| echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" | |||
| exit 1 | |||
| fi | |||
| ulimit -u unlimited | |||
| export DEVICE_NUM=1 | |||
| export DEVICE_ID=0 | |||
| export RANK_ID=0 | |||
| export RANK_SIZE=1 | |||
| if [ -d "train" ]; | |||
| then | |||
| rm -rf ./train | |||
| fi | |||
| mkdir ./train | |||
| cp ../*.py ./train | |||
| cp *.sh ./train | |||
| cp -r ../src ./train | |||
| cd ./train || exit | |||
| echo "start training" | |||
| env > env.log | |||
| if [ $# == 1 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & | |||
| fi | |||
| if [ $# == 2 ] | |||
| then | |||
| python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & | |||
| fi | |||
| cd .. | |||
| @@ -28,7 +28,13 @@ cfg = ed({ | |||
| "batch_size": 32, | |||
| "loss_scale": 1024, | |||
| {% if optimizer=='Momentum' %} | |||
| "lr": 0.01, | |||
| "momentum": 0.9, | |||
| "lr": 0.01, | |||
{% elif optimizer=='SGD' %}
"lr": 0.1,
{% else %}
"lr": 0.001,
{% endif %}
| "image_height": 224, | |||
| "image_width": 224, | |||
| @@ -48,7 +54,6 @@ cfg = ed({ | |||
| {% endif %} | |||
| "use_label_smooth": True, | |||
| "label_smooth_factor": 0.1, | |||
| "lr": 0.01, | |||
| "lr_init": 0.01, | |||
| "lr_end": 0.00001, | |||
| "lr_max": 0.1 | |||
| @@ -112,12 +112,11 @@ if __name__ == '__main__': | |||
| lr = Tensor(lr) | |||
| # define opt | |||
| {% if optimizer=='Lamb' %} | |||
| opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, | |||
| weight_decay=cfg.weight_decay) | |||
| {% elif optimizer=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum, | |||
| weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale) | |||
| {% else %} | |||
| opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=cfg.lr) | |||
| {% endif %} | |||
| # define loss, model | |||
| @@ -125,7 +124,7 @@ if __name__ == '__main__': | |||
| {% if dataset=='ImageNet' %} | |||
| if not cfg.use_label_smooth: | |||
| cfg.label_smooth_factor = 0.0 | |||
loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
| {% else %} | |||
| {% if loss=='SoftmaxCrossEntropyWithLogits' %} | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||
| @@ -143,10 +142,10 @@ if __name__ == '__main__': | |||
| {% elif loss=='SoftmaxCrossEntropyExpand' %} | |||
| loss = nn.SoftmaxCrossEntropyExpand(sparse=True) | |||
| {% endif %} | |||
| {% if optimizer=='Lamb' %} | |||
| opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr) | |||
| {% elif optimizer=='Momentum' %} | |||
| {% if optimizer=='Momentum' %} | |||
| opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum) | |||
| {% else %} | |||
| opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr) | |||
| {% endif %} | |||
| model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) | |||
| @@ -15,4 +15,4 @@ class Network(GenericNetwork): | |||
| name = 'alexnet' | |||
| supported_datasets = ['Cifar10', 'ImageNet'] | |||
| supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand'] | |||
| supported_optimizers = ['Momentum', 'Lamb'] | |||
| supported_optimizers = ['Momentum', 'Adam', 'SGD'] | |||
| @@ -21,4 +21,4 @@ class Network(GenericNetwork): | |||
| name = 'lenet' | |||
| supported_datasets = ['MNIST'] | |||
| supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand'] | |||
| supported_optimizers = ['Momentum', 'Lamb'] | |||
| supported_optimizers = ['Momentum', 'Adam', 'SGD'] | |||
| @@ -6,7 +6,7 @@ | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """alexnet module.""" | |||
| """Resnet50 module.""" | |||
| from mindinsight.wizard.network.generic_network import GenericNetwork | |||
| @@ -15,4 +15,4 @@ class Network(GenericNetwork): | |||
| name = 'resnet50' | |||
| supported_datasets = ['Cifar10', 'ImageNet'] | |||
| supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand'] | |||
| supported_optimizers = ['Momentum', 'Lamb'] | |||
| supported_optimizers = ['Momentum', 'Adam', 'SGD'] | |||
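Each wizard network registers itself by subclassing `GenericNetwork` and declaring its supported options, as the three modules above show; a hypothetical new entry would follow the same shape (all names below are invented for illustration):

```python
"""Hypothetical example module, mirroring the pattern of the modules above."""
from mindinsight.wizard.network.generic_network import GenericNetwork


class Network(GenericNetwork):
    """Declare what the wizard may offer for this architecture."""
    name = 'mynet'  # hypothetical network name
    supported_datasets = ['Cifar10']
    supported_loss_functions = ['SoftmaxCrossEntropyWithLogits']
    supported_optimizers = ['Momentum', 'Adam', 'SGD']
```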