# diff --git a/npu/config.py b/npu/config.py
# new file mode 100755
# index 0000000..22d68e2
# --- /dev/null
# +++ b/npu/config.py
# @@ -0,0 +1,33 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Hyper-parameters and checkpoint settings for the LeNet/MNIST example.

The training scripts import this object as ``cfg`` and read its entries by
attribute (``cfg.lr``) or by key (``cfg['epoch_size']``).
"""

from easydict import EasyDict as edict

# Keyword form: EasyDict turns every keyword into an attribute/key pair.
mnist_cfg = edict(
    num_classes=10,              # passed to LeNet5 as the number of output classes
    lr=0.01,                     # learning rate handed to nn.Momentum
    momentum=0.9,                # momentum handed to nn.Momentum
    epoch_size=10,               # fallback epoch count read by the training scripts
    batch_size=32,               # batch size handed to create_dataset
    buffer_size=1000,            # NOTE(review): not read by the dataset code visible here
    image_height=32,             # NOTE(review): dataset code hard-codes 32x32 - confirm usage
    image_width=32,
    save_checkpoint_steps=1875,  # CheckpointConfig: steps between checkpoints
    keep_checkpoint_max=150,     # CheckpointConfig: maximum checkpoints kept
    air_name="lenet",            # NOTE(review): presumably an export name - confirm
)
# diff --git a/npu/dataset.py b/npu/dataset.py
# new file mode 100755
# index 0000000..df9eecd
# --- /dev/null
# +++ b/npu/dataset.py
# @@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Build the single-device MNIST input pipeline.
"""

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype


def create_dataset(data_path, batch_size=32, repeat_size=1,
                   num_parallel_workers=1):
    """
    Create the MNIST dataset used for training or testing.

    Args:
        data_path: Directory handed to ``ds.MnistDataset``.
        batch_size: Samples per batch; an incomplete last batch is dropped.
        repeat_size: Number of times the batched dataset is repeated.
        num_parallel_workers: Worker count used by every ``map`` call.

    Returns:
        The shuffled, batched and repeated dataset object.
    """
    target_size = (32, 32)
    pixel_scale, pixel_shift = 1.0 / 255.0, 0.0
    # presumably the MNIST std (0.3081) and mean (0.1307) - confirm
    norm_scale = 1 / 0.3081
    norm_shift = -1 * 0.1307 / 0.3081

    pipeline = ds.MnistDataset(data_path)

    # (column, operation) pairs, applied one map call each, in this order.
    column_ops = (
        ("label", C.TypeCast(mstype.int32)),
        ("image", CV.Resize(target_size, interpolation=Inter.LINEAR)),  # Bilinear mode
        ("image", CV.Rescale(pixel_scale, pixel_shift)),
        ("image", CV.Rescale(norm_scale, norm_shift)),
        ("image", CV.HWC2CHW()),
    )
    for column, operation in column_ops:
        pipeline = pipeline.map(operations=operation, input_columns=column,
                                num_parallel_workers=num_parallel_workers)

    # 10000 as in LeNet train script
    pipeline = pipeline.shuffle(buffer_size=10000)
    pipeline = pipeline.batch(batch_size, drop_remainder=True)
    return pipeline.repeat(repeat_size)
# diff --git a/npu/dataset_distributed.py b/npu/dataset_distributed.py
# new file mode 100755
# index 0000000..d813078
# --- /dev/null
# +++
# b/npu/dataset_distributed.py
# @@ -0,0 +1,55 @@

"""
Produce the dataset:
Unlike the single-device case, the dataset interface must be given num_shards
and shard_id, which correspond to the number of cards and the logical index of
the current card. Obtaining them through the HCCL interface is recommended:
get_rank: get the ID of the current device in the cluster.
get_group_size: get the number of devices in the cluster.

"""

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype
from mindspore.communication.management import init, get_rank, get_group_size


def create_dataset_parallel(data_path, batch_size=32, repeat_size=1,
                            num_parallel_workers=1, shard_id=None, num_shards=None):
    """
    Create the sharded MNIST dataset used for data-parallel training.

    Args:
        data_path: Directory handed to ``ds.MnistDataset``.
        batch_size: Samples per batch; an incomplete last batch is dropped.
        repeat_size: Number of times the batched dataset is repeated.
        num_parallel_workers: Worker count used by every ``map`` call.
        shard_id: Shard read by this process. ``None`` (default) uses
            ``get_rank()``.
        num_shards: Total number of shards. ``None`` (default) uses
            ``get_group_size()``.

    Returns:
        The shuffled, batched and repeated dataset object for this shard.
    """
    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0
    rescale_nml = 1 / 0.3081
    shift_nml = -1 * 0.1307 / 0.3081

    # Previously both arguments were overwritten unconditionally, so explicit
    # values were silently ignored. Query the communication layer only when
    # the caller did not choose a value.
    if shard_id is None:
        shard_id = get_rank()
    if num_shards is None:
        num_shards = get_group_size()

    # define dataset
    mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id)

    # define map operations
    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)

    return mnist_ds
# diff --git a/npu/inference.py b/npu/inference.py
# new file mode 100755
# index 0000000..f0501e9
# --- /dev/null
# +++ b/npu/inference.py
# @@ -0,0 +1,202 @@
"""
######################## single-dataset inference lenet example ########################
This example is a single-dataset inference tutorial.

######################## Instructions for using the inference environment ########################
The image of the debugging environment and the image of the inference environment are two different images,
and the working local directories are different. In the inference task, you need to pay attention to the following points.
1、(1)The structure of the dataset uploaded for single dataset inference in this example
 MNISTData.zip
  ├── test
  │   ├── t10k-images-idx3-ubyte
  │   └── t10k-labels-idx1-ubyte
  └── train
      ├── train-images-idx3-ubyte
      └── train-labels-idx1-ubyte

   (2)The dataset structure of the single dataset in the inference image in this example
   workroot
   ├── data
   |    ├── test
   |    └── train

2、Inference task requires predefined functions
(1)Defines whether the task is a inference environment or a debugging environment.
def WorkEnvironment(environment):
    if environment == 'train':
        workroot = '/home/work/user-job-dir' #The inference task uses this parameter to represent the local path of the inference image
    elif environment == 'debug':
        workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

(2)Copy single dataset from obs to inference image.
def ObsToEnv(obs_data_url, data_dir):
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
    return

(3)Copy ckpt file from obs to inference image.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
    try:
        mox.file.copy(obs_ckpt_url, ckpt_url)
        print("Successfully Download {} to {}".format(obs_ckpt_url,
                                                       ckpt_url))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(
            obs_ckpt_url, ckpt_url) + str(e))
    return

(4)Copy the output result to obs.
def EnvToObs(train_dir, obs_train_url):
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
    return

3、4 parameters need to be defined.
--data_url is the dataset you selected on the Qizhi platform
--ckpt_url is the weight file you choose on the Qizhi platform

--data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a single dataset,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform,
because they are predefined in the background, you only need to define them in your code.

4、How the dataset is used
Inference task uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method
of the dataset in the image.
For details, please refer to the following sample code.
"""

import os
import argparse
import moxing as mox
import mindspore.nn as nn
from mindspore import context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore import Tensor
import numpy as np
from glob import glob
from dataset import create_dataset
from config import mnist_cfg as cfg
from lenet import LeNet5

### Defines whether the task is a inference environment or a debugging environment ###
def WorkEnvironment(environment):
    """
    Return the local working root for the given environment name.

    Args:
        environment: ``'train'`` or ``'debug'``.

    Returns:
        The working-root path string for that environment.

    Raises:
        ValueError: If ``environment`` is neither ``'train'`` nor ``'debug'``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # NOTE(review): the module docstring above shows '/home/ma-user/work' for
    # 'debug' while the code uses '/home/work' - confirm which one is intended.
    workroots = {
        'train': '/home/work/user-job-dir',
        'debug': '/home/work',
    }
    if environment not in workroots:
        raise ValueError(
            "unknown environment {!r}; expected 'train' or 'debug'".format(environment))
    workroot = workroots[environment]
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

### Copy single dataset from obs to inference image ###
def ObsToEnv(obs_data_url, data_dir):
    """
    Copy a dataset folder from ``obs_data_url`` into the local ``data_dir``.

    Best effort: any exception raised by the copy is caught and reported on
    stdout, so the caller is not interrupted. Returns ``None``.
    """
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
    return
### Copy ckpt file from obs to inference image###
### To operate on folders, use mox.file.copy_parallel. If copying a file.
### Use mox.file.copy for a single file; mox.file.copy_parallel is for folders.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
    """
    Copy one checkpoint file from ``obs_ckpt_url`` to the local ``ckpt_url``.

    Best effort: any exception is caught and reported on stdout.
    Returns ``None``.
    """
    try:
        mox.file.copy(obs_ckpt_url, ckpt_url)
        outcome = "Successfully Download {} to {}".format(obs_ckpt_url, ckpt_url)
    except Exception as err:
        outcome = 'moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(err)
    print(outcome)


### Copy the output result to obs###
def EnvToObs(train_dir, obs_train_url):
    """
    Copy the local folder ``train_dir`` to ``obs_train_url``.

    Best effort: any exception is caught and reported on stdout.
    Returns ``None``.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        outcome = "Successfully Upload {} to {}".format(train_dir, obs_train_url)
    except Exception as err:
        outcome = 'moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(err)
    print(outcome)


### --data_url, --ckpt_url, --result_url and --device_target must be defined first in an inference task,
### otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform,
### because they are predefined in the background; you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--data_url',
                    type=str,
                    default= WorkEnvironment('train') + '/data/',
                    help='path where the dataset is saved')
parser.add_argument('--ckpt_url',
                    help='model to save/load',
                    default= WorkEnvironment('train') + '/checkpoint.ckpt')
parser.add_argument('--result_url',
                    help='result folder to save/load',
                    default= WorkEnvironment('train') + '/result/')
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
                    help='device where the code will be implemented (default: Ascend)')

if __name__ == "__main__":
    # Unknown platform-supplied arguments are tolerated rather than rejected.
    args, unknown = parser.parse_known_args()
    ### defining the training environment
    environment = 'train'
    workroot = WorkEnvironment(environment)

    ###Initialize the data and result directories in the inference image###
    data_dir = workroot + '/data'
    result_dir = workroot + '/result'
    ckpt_url = workroot + '/checkpoint.ckpt'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    ###Copy dataset from obs to inference image
    ObsToEnv(args.data_url, data_dir)

    ###Copy ckpt file from obs to inference image
    ObsUrlToEnv(args.ckpt_url, ckpt_url)

    ###Set output path result_url
    obs_result_url = args.result_url

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")

    param_dict = load_checkpoint(ckpt_url)
    load_param_into_net(network, param_dict)
    # Only the first sample of the test split is predicted (batch_size=1, one next()).
    ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator()
    data = next(ds_test)
    labels = data["label"].asnumpy()
    print('Tensor:', Tensor(data['image']))
    output = model.predict(Tensor(data['image']))
    predicted = np.argmax(output.asnumpy(), axis=1)
    # Both log lines are kept so the output is unchanged; the value is computed once.
    print('predicted:', predicted)
    print('pred:', predicted)

    print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
    filename = 'result.txt'
    file_path = os.path.join(result_dir, filename)
    with open(file_path, 'a+') as result_file:
        result_file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))

    ###Copy result data from the local running environment back to obs,
    ###and download it in the inference task corresponding to the Qizhi platform
    EnvToObs(result_dir, obs_result_url)
# \ No newline at end of file
# diff --git a/npu/lenet.py b/npu/lenet.py
# new file mode 100755
# index 0000000..0600793
# --- /dev/null
# +++ b/npu/lenet.py
# @@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LeNet."""
import mindspore.nn as nn
from mindspore.common.initializer import Normal


class LeNet5(nn.Cell):
    """
    Lenet network

    Args:
        num_class (int): Number of classes. Default: 10.
        num_channel (int): Number of channels. Default: 1.
        include_top (bool): When True (default) the flatten and three dense
            layers are built and applied; when False ``construct`` returns
            the feature map produced after the second pooling step.

    Returns:
        Tensor, output tensor

    Examples:
        >>> LeNet5(num_class=10)

    """
    def __init__(self, num_class=10, num_channel=1, include_top=True):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
        self.relu = nn.ReLU()
        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
        self.include_top = include_top
        if self.include_top:
            self.flatten = nn.Flatten()
            # fc1 expects 16 * 5 * 5 flattened features from the conv stack.
            self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
            self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
            self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))

    def construct(self, x):
        """Run conv-relu-pool twice, then the dense head if include_top."""
        x = self.conv1(x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.max_pool2d(x)
        if not self.include_top:
            return x
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x
# diff --git a/npu/train.py b/npu/train.py
# new file mode 100755
# index 0000000..4c4df57
# --- /dev/null
# +++ b/npu/train.py
# @@ -0,0 +1,193 @@
"""
######################## single-dataset train lenet example ########################
This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training
tutorial train_for_multidataset.py. This example cannot be used for multi-datasets!

######################## Instructions for using the training environment ########################
The image of the debugging environment and the image of the training environment are two different images,
and the working local directories are different. In the training task, you need to pay attention to the following points.
1、(1)The structure of the dataset uploaded for single dataset training in this example
 MNISTData.zip
  ├── test
  │   ├── t10k-images-idx3-ubyte
  │   └── t10k-labels-idx1-ubyte
  └── train
      ├── train-images-idx3-ubyte
      └── train-labels-idx1-ubyte

   (2)The dataset structure of the single dataset in the training image in this example
   workroot
   ├── data
   |    ├── test
   |    └── train

2、Single dataset training requires predefined functions
(1)Defines whether the task is a training environment or a debugging environment.
def WorkEnvironment(environment):
    if environment == 'train':
        workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
    elif environment == 'debug':
        workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

(2)Copy single dataset from obs to training image.
def ObsToEnv(obs_data_url, data_dir):
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
    return

(3)Copy the output model to obs.
def EnvToObs(train_dir, obs_train_url):
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
    return

3、3 parameters need to be defined
--data_url is the dataset you selected on the Qizhi platform

--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform,
because they are predefined in the background, you only need to define them in your code.

4、How the dataset is used
A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method
of the dataset in the image.
For details, please refer to the following sample code.

"""

import os
import argparse
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed

### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
    """
    Return the local working root for the given environment name.

    Args:
        environment: ``'train'`` or ``'debug'``.

    Returns:
        The working-root path string for that environment.

    Raises:
        ValueError: If ``environment`` is neither ``'train'`` nor ``'debug'``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # NOTE(review): the module docstring above shows '/home/ma-user/work' for
    # 'debug' while the code uses '/home/work' - confirm which one is intended.
    workroots = {
        'train': '/home/work/user-job-dir',
        'debug': '/home/work',
    }
    if environment not in workroots:
        raise ValueError(
            "unknown environment {!r}; expected 'train' or 'debug'".format(environment))
    workroot = workroots[environment]
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

### Copy single dataset from obs to training image###
def ObsToEnv(obs_data_url, data_dir):
    """
    Copy a dataset folder from ``obs_data_url`` into the local ``data_dir``.

    Best effort: any exception is caught and reported on stdout.
    Returns ``None``.
    """
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
    return
### Copy the output model to obs###
def EnvToObs(train_dir, obs_train_url):
    """
    Copy the local folder ``train_dir`` to ``obs_train_url``.

    Best effort: any exception is caught and reported on stdout.
    Returns ``None``.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
    return

### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset,
### otherwise an error will be reported.
+###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + ### defining the training environment + environment = 'train' + workroot = WorkEnvironment(environment) + + ###Initialize the data and model directories in the training image### + data_dir = workroot + '/data' + train_dir = workroot + '/model' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + + ### Copy the dataset from obs to the training image ### + ObsToEnv(args.data_url,data_dir) + + ###Specifies the device CPU or Ascend NPU used for training### + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + ds_train = create_dataset(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if 
args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) + diff --git a/npu/train_dataparallel.py b/npu/train_dataparallel.py new file mode 100755 index 0000000..19ba828 --- /dev/null +++ b/npu/train_dataparallel.py @@ -0,0 +1,205 @@ +""" +######################## single-dataset train lenet example ######################## +This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training +tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! + +######################## Instructions for using the training environment ######################## +The image of the debugging environment and the image of the training environment are two different images, +and the working local directories are different. In the training task, you need to pay attention to the following points. 
+1、(1)The structure of the dataset uploaded for single dataset training in this example + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + + (2)The dataset structure of the single dataset in the training image in this example + workroot + ├── data + | ├── test + | └── train + +2、Single dataset training requires predefined functions +(1)Defines whether the task is a training environment or a debugging environment. +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image + elif environment == 'debug': + workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +(2)Copy single dataset from obs to training image. +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return + +(3)Copy the output model to obs. +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +3、3 parameters need to be defined +--data_url is the dataset you selected on the Qizhi platform + +--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, +otherwise an error will be reported. 
+There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code. + +4、How the dataset is used +A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method +of the dataset in the image. +For details, please refer to the following sample code. + +""" + +import os +import argparse +from dataset_distributed import create_dataset_parallel +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.common import set_seed +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank, get_group_size +import mindspore.ops as ops + + +# set device_id and init +device_id = int(os.getenv('DEVICE_ID')) +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +context.set_context(device_id=device_id) +init() + +### Defines whether the task is a training environment or a debugging environment ### +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' + elif environment == 'debug': + workroot = '/home/work' + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return +### Copy the output model to obs### +def EnvToObs(train_dir, obs_train_url): + 
try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') +set_seed(114514) +if __name__ == "__main__": + args = parser.parse_args() + ### defining the training environment + environment = 'train' + workroot = WorkEnvironment(environment) + + ###Initialize the data and model directories in the training image### + data_dir = workroot + '/data' + train_dir = workroot + '/model' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + + ### Copy the dataset from obs to the training image ### + ObsToEnv(args.data_url,data_dir) + + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), + 
cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In the example, get_rank() is added to distinguish different paths. + ckpoint_cb = ModelCheckpoint(prefix="data_parallel", + directory=train_dir + "/" + str(get_rank()) + "/", + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()], dataset_sink_mode=True) + + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) + diff --git a/npu/train_fail.py b/npu/train_fail.py new file mode 100755 index 0000000..de54b81 --- /dev/null +++ b/npu/train_fail.py @@ -0,0 +1,4 @@ +import abc + +print('hello abc') +abc.func(1) diff --git a/npu/train_for_c2net.py b/npu/train_for_c2net.py new file mode 100755 index 0000000..8356a9f --- /dev/null +++ b/npu/train_for_c2net.py @@ -0,0 +1,90 @@ +""" +######################## train lenet example 
######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +set_seed(1) + +if __name__ == "__main__": + args = parser.parse_args() + print('args:') + print(args) + + train_dir = '/cache/output' + data_dir = '/cache/dataset' + + #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + #创建数据集 + ds_train = create_dataset(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + #创建网络 + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + 
keep_checkpoint_max=cfg.keep_checkpoint_max) + #定义模型输出路径 + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + #开始训练 + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + + print("============== Finish Training ==============") \ No newline at end of file diff --git a/npu/train_for_c2net_dataparallel.py b/npu/train_for_c2net_dataparallel.py new file mode 100755 index 0000000..0812020 --- /dev/null +++ b/npu/train_for_c2net_dataparallel.py @@ -0,0 +1,96 @@ +""" +######################## train lenet dataparallel example ######################## +train lenet and get network model files(.ckpt) + +The training of the intelligent computing network currently supports single dataset training, and does not require +the obs copy process.It only needs to define two parameters and then call it directly: + train_dir = '/cache/output' #The location of the output + data_dir = '/cache/dataset' #The location of the dataset + +""" + +import os +import argparse +from dataset_distributed import create_dataset_parallel +import moxing as mox +from config import mnist_cfg as cfg +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.common import set_seed +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank, get_group_size +import mindspore.ops as ops + + +# set device_id and init +device_id = int(os.getenv('DEVICE_ID')) +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +context.set_context(device_id=device_id) +init() + +parser 
= argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') +set_seed(114514) +if __name__ == "__main__": + args = parser.parse_args() + + ###define two parameters and then call it directly### + train_dir = '/cache/output' + data_dir = '/cache/dataset' + + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In the example, get_rank() is added to distinguish different paths. 
+ ckpoint_cb = ModelCheckpoint(prefix="data_parallel", + directory=train_dir + "/" + str(get_rank()) + "/", + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size,ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], dataset_sink_mode=False) + + diff --git a/npu/train_for_c2net_tf.py b/npu/train_for_c2net_tf.py new file mode 100755 index 0000000..7c0a511 --- /dev/null +++ b/npu/train_for_c2net_tf.py @@ -0,0 +1,146 @@ +# coding: utf-8 +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data +import os + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' + +mnist = input_data.read_data_sets('mnist_data', one_hot=True) + +#初始化过滤器 +def weight_variable(shape): + return tf.Variable(tf.truncated_normal(shape, stddev=0.1)) + +#初始化偏置,初始化时,所有值是0.1 +def bias_variable(shape): + return tf.Variable(tf.constant(0.1, shape=shape)) + +#卷积运算,strides表示每一维度滑动的步长,一般strides[0]=strides[3]=1 +#第四个参数可选"Same"或"VALID",“Same”表示边距使用全0填充 +def conv2d(x, W): + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") + + +#池化运算 +def max_pool_2x2(x): + + return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") + +#创建x占位符,用于临时存放MNIST图片的数据, +# [None, 784]中的None表示不限长度,而784则是一张图片的大小(28×28=784) +x = tf.placeholder(tf.float32, [None, 784], name='input') +#y_存的是实际图像的标签,即对应于每张输入图片实际的值 +y_ = tf.placeholder(tf.float32, [None, 10]) + +#将图片从784维向量重新还原为28×28的矩阵图片, +# 原因参考卷积神经网络模型图,最后一个参数代表深度, +# 因为MNIST是黑白图片,所以深度为1, +# 第一个参数为-1,表示一维的长度不限定,这样就可以灵活设置每个batch的训练的个数了 +x_image = tf.reshape(x, [-1, 28, 28, 1]) + +#第一层卷积 +#将过滤器设置成5×5×1的矩阵, +#其中5×5表示过滤器大小,1表示深度,因为MNIST是黑白图片只有一层。所以深度为1 +#32表示我们要创建32个大小5×5×1的过滤器,经过卷积后算出32个特征图(每个过滤器得到一个特征图),即输出深度为64 +W_conv1 = weight_variable([5, 5, 1, 32]) +#有多少个特征图就有多少个偏置 +b_conv1 = bias_variable([32]) +#使用conv2d函数进行卷积计算,然后再用ReLU作为激活函数 +h_conv1 = 
tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) +#卷积以后再经过池化操作 +h_pool1 = max_pool_2x2(h_conv1) + +#第二层卷积 +#因为经过第一层卷积运算后,输出的深度为32,所以过滤器深度和下一层输出深度也做出改变 +W_conv2 = weight_variable([5, 5, 32, 64]) +b_conv2 = bias_variable([64]) +h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) +h_pool2 = max_pool_2x2(h_conv2) + +#全连接层 +#经过两层卷积后,图片的大小为7×7(第一层池化后输出为(28/2)×(28/2), +#第二层池化后输出为(14/2)×(14/2)),深度为64, +#我们在这里加入一个有1024个神经元的全连接层,所以权重W的尺寸为[7 * 7 * 64, 1024] +W_fc1 = weight_variable([7 * 7 * 64, 1024]) +#偏置的个数和权重的个数一致 +b_fc1 = bias_variable([1024]) +#这里将第二层池化后的张量(长:7 宽:7 深度:64) 变成向量(跟上一节的Softmax模型的输入一样了) +h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) +#使用ReLU激活函数 +h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) + +#dropout +#为了减少过拟合,我们在输出层之前加入dropout +keep_prob = tf.placeholder(tf.float32, name='keep_prob') +h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) + +#输出层 +#全连接层输入的大小为1024,而我们要得到的结果的大小是10(0~9), +# 所以这里权重W的尺寸为[1024, 10] +W_fc2 = weight_variable([1024, 10]) +b_fc2 = bias_variable([10]) + +#最后都要经过Softmax函数将输出转化为概率问题 +y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output') + +#损失函数和损失优化 +cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv))) +train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) + +#测试准确率,跟Softmax回归模型的一样 +correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) +accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + +# #将训练结果保存,如果不保存我们这次训练结束后的结果也随着程序运行结束而释放了 +# savePath = './mnist_conv/' +# saveFile = savePath + 'mnist_conv.ckpt' +# if os.path.exists(savePath) == False: +# os.mkdir(savePath) + +# saver = tf.train.Saver() + + +#开始训练 +with tf.Session() as sess: + + #初始化所有变量 + sess.run(tf.global_variables_initializer()) + + #训练两万次 + for i in range(2000): + + #每次获取50张图片数据和对应的标签 + batch = mnist.train.next_batch(50) + + #每训练100次,我们打印一次训练的准确率 + if i % 100 == 0: + train_accuracy =sess.run(accuracy, feed_dict={x:batch[0], y_:batch[1], keep_prob:1.0}) + print("step %d, training 
accuracy %g" % (i, train_accuracy)) + + #这里是真的训练,将数据传入 + sess.run(train_step, feed_dict={x:batch[0], y_:batch[1], keep_prob:0.5}) + + + + # print ("end train, start testing...") + # mean_value = 0.0 + # for i in range(mnist.test.labels.shape[0]): + # batch = mnist.test.next_batch(50) + # train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0}) + # mean_value += train_accuracy + + + + # print("test accuracy %g" % (mean_value / mnist.test.labels.shape[0])) + # #训练结束后,我们使用mnist.test在测试最后的准确率 + # print("test accuracy %g" % sess.run(accuracy, feed_dict={x:mnist.test.images, y_:mnist.test.labels, keep_prob:1.0})) + + + # 最后,将会话保存下来 + # saver.save(sess, saveFile) + + # 用SavedModel的方式保存 + tf.compat.v1.saved_model.simple_save(sess, + "/cache/output/saved_model", + inputs={"input": x, 'keep_prob':keep_prob}, + outputs={"output": y_conv}) diff --git a/npu/train_for_multidataset.py b/npu/train_for_multidataset.py new file mode 100755 index 0000000..a2b7694 --- /dev/null +++ b/npu/train_for_multidataset.py @@ -0,0 +1,237 @@ +""" +######################## multi-dataset train lenet example ######################## +This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset +training tutorial train.py. This example cannot be used for a single dataset! 
+""" +""" +######################## Instructions for using the training environment ######################## +1、(1)The structure of the dataset uploaded for multi-dataset training in this example + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + + checkpoint_lenet-1_1875.zip + ├── checkpoint_lenet-1_1875.ckpt + + (2)The dataset structure in the training image for multiple datasets in this example + workroot + ├── MNISTData + | ├── test + | └── train + └── checkpoint_lenet-1_1875 + ├── checkpoint_lenet-1_1875.ckpt + +2、Multi-dataset training requires predefined functions +(1)Defines whether the task is a training environment or a debugging environment. +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image + elif environment == 'debug': + workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +(2)Copy multiple datasets from obs to training image +def MultiObsToEnv(multi_data_url, workroot): + multi_data_json = json.loads(multi_data_url) #Parse multi_data_url + for i in range(len(multi_data_json)): + path = workroot + "/" + multi_data_json[i]["dataset_name"] + if not os.path.exists(path): + os.makedirs(path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], + path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + return + +***The input and output of the MultiObsToEnv function in this example: + Input for multi_data_url: + [ + { + "dataset_url": 
"s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388e + ae3a316-42d6-4a43-a484-1fa573eab388/", #obs path of the dataset + "dataset_name": "MNISTData" #the name of the dataset + }, + { + "dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c + 59be66-64ec-41ca-b311-f51a486eabf8/", + "dataset_name": "checkpoint_lenet-1_1875" + } + ] + Purpose of multi_data_url: + The purpose of the MultiObsToEnv function is to copy multiple datasets from obs to the training image + and build the dataset path in the training image. + For example, the path of the MNISTData dataset in this example is /home/work/user-job-dir/MNISTData, + The path to the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875 + +(3)Copy the output model to obs. +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + +3、4 parameters need to be defined +--data_url is the first dataset you selected on the Qizhi platform +--multi_data_url is the multi-dataset you selected on the Qizhi platform + +--data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code + +4、How the dataset is used +Multi-datasets use multi_data_url as input, workroot + dataset name + file or folder name in the dataset as the +calling path of the dataset in the training image. 
+For example, the calling path of the train folder in the MNIST_Data dataset in this example is +workroot + "/MNIST_Data" +"/train" + +For details, please refer to the following sample code. +""" + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed +from mindspore import load_checkpoint, load_param_into_net + +### Defines whether the task is a training environment or a debugging environment ### +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' + elif environment == 'debug': + workroot = '/home/ma-user/work' + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, workroot): + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = workroot + "/" + multi_data_json[i]["dataset_name"] + if not os.path.exists(path): + os.makedirs(path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], + path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + return +### Copy the output model to obs ### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + + +parser = 
argparse.ArgumentParser(description='MindSpore Lenet Example') +### --data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--multi_data_url', + help='path to multi dataset', + default= WorkEnvironment('train')) + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + # After defining the training environment, first execute the WorkEnv function and the GetMultiDataPath function to + # copy multiple datasets from obs to the training image + environment = 'train' + workroot = WorkEnvironment(environment) + MultiObsToEnv(args.multi_data_url, workroot) + + ### Define the output path in the training image + train_dir = workroot + '/model' + if not os.path.exists(train_dir): + os.makedirs(train_dir) + + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + #The dataset path is used here:workroot + "/MNIST_Data" +"/train" "" + ds_train = create_dataset(os.path.join(workroot + "/MNISTData", "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + 
network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + ### Load the trained model:workroot + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" + load_param_into_net(network, load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875", + "checkpoint_lenet-1_1875.ckpt"))) + + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) + diff --git a/npu/train_for_multidataset_dataparallel.py b/npu/train_for_multidataset_dataparallel.py new file mode 100755 index 0000000..2d5e27d --- /dev/null +++ b/npu/train_for_multidataset_dataparallel.py @@ -0,0 +1,249 @@ +""" +######################## multi-dataset train lenet example ######################## +This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset +training tutorial train.py. This example cannot be used for a single dataset! 
+""" +""" +######################## Instructions for using the training environment ######################## +1、(1)The structure of the dataset uploaded for multi-dataset training in this example + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + + checkpoint_lenet-1_1875.zip + ├── checkpoint_lenet-1_1875.ckpt + + (2)The dataset structure in the training image for multiple datasets in this example + workroot + ├── MNISTData + | ├── test + | └── train + └── checkpoint_lenet-1_1875 + ├── checkpoint_lenet-1_1875.ckpt + +2、Multi-dataset training requires predefined functions +(1)Defines whether the task is a training environment or a debugging environment. +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image + elif environment == 'debug': + workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +(2)Copy multiple datasets from obs to training image +def MultiObsToEnv(multi_data_url, workroot): + multi_data_json = json.loads(multi_data_url) #Parse multi_data_url + for i in range(len(multi_data_json)): + path = workroot + "/" + multi_data_json[i]["dataset_name"] + if not os.path.exists(path): + os.makedirs(path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], + path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + return + +***The input and output of the MultiObsToEnv function in this example: + Input for multi_data_url: + [ + { + "dataset_url": 
"s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388e + ae3a316-42d6-4a43-a484-1fa573eab388/", #obs path of the dataset + "dataset_name": "MNISTData" #the name of the dataset + }, + { + "dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c + 59be66-64ec-41ca-b311-f51a486eabf8/", + "dataset_name": "checkpoint_lenet-1_1875" + } + ] + Purpose of multi_data_url: + The purpose of the MultiObsToEnv function is to copy multiple datasets from obs to the training image + and build the dataset path in the training image. + For example, the path of the MNISTData dataset in this example is /home/work/user-job-dir/MNISTData, + The path to the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875 + +(3)Copy the output model to obs. +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + +3、4 parameters need to be defined +--data_url is the first dataset you selected on the Qizhi platform +--multi_data_url is the multi-dataset you selected on the Qizhi platform + +--data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code + +4、How the dataset is used +Multi-datasets use multi_data_url as input, workroot + dataset name + file or folder name in the dataset as the +calling path of the dataset in the training image. 
+For example, the calling path of the train folder in the MNIST_Data dataset in this example is +workroot + "/MNIST_Data" +"/train" + +For details, please refer to the following sample code. +""" + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset_distributed import create_dataset_parallel +from dataset import create_dataset +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank, get_group_size +import mindspore.ops as ops + +# set device_id and init +device_id = int(os.getenv('ASCEND_DEVICE_ID')) +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +context.set_context(device_id=device_id) +init() + +### Defines whether the task is a training environment or a debugging environment ### +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' + elif environment == 'debug': + workroot = '/home/ma-user/work' + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, workroot): + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = workroot + "/" + multi_data_json[i]["dataset_name"] + if not os.path.exists(path): + os.makedirs(path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], + path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + 
multi_data_json[i]["dataset_url"], path) + str(e)) + return +### Copy the output model to obs ### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--multi_data_url', + help='path to multi dataset', + default= WorkEnvironment('train')) + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') +set_seed(114514) +if __name__ == "__main__": + args = parser.parse_args() + # After defining the training environment, first execute the WorkEnv function and the GetMultiDataPath function to + # copy multiple datasets from obs to the training image + environment = 'train' + workroot = WorkEnvironment(environment) + MultiObsToEnv(args.multi_data_url, workroot) + + ### Define the output path in the training image + train_dir = workroot + '/model' + if not 
os.path.exists(train_dir): + os.makedirs(train_dir) + + ### Copy the dataset from obs to the training image ### + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + ds_train = create_dataset_parallel(os.path.join(workroot + "/MNISTData", "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + ### Load the trained model:workroot + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" + load_param_into_net(network, load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875", + "checkpoint_lenet-1_1875.ckpt"))) + + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In the example, get_rank() is added to distinguish different paths. 
+ ckpoint_cb = ModelCheckpoint(prefix="data_parallel", + directory=train_dir + "/" + str(get_rank()) + "/", + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) +