diff --git a/npu/config.py b/npu/config.py deleted file mode 100755 index 22d68e2..0000000 --- a/npu/config.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -network config setting, will be used in train.py -""" - -from easydict import EasyDict as edict - -mnist_cfg = edict({ - 'num_classes': 10, - 'lr': 0.01, - 'momentum': 0.9, - 'epoch_size': 10, - 'batch_size': 32, - 'buffer_size': 1000, - 'image_height': 32, - 'image_width': 32, - 'save_checkpoint_steps': 1875, - 'keep_checkpoint_max': 150, - 'air_name': "lenet", -}) diff --git a/npu/dataset.py b/npu/dataset.py deleted file mode 100755 index df9eecd..0000000 --- a/npu/dataset.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Produce the dataset -""" - -import mindspore.dataset as ds -import mindspore.dataset.vision.c_transforms as CV -import mindspore.dataset.transforms.c_transforms as C -from mindspore.dataset.vision import Inter -from mindspore.common import dtype as mstype - - -def create_dataset(data_path, batch_size=32, repeat_size=1, - num_parallel_workers=1): - """ - create dataset for train or test - """ - # define dataset - mnist_ds = ds.MnistDataset(data_path) - - resize_height, resize_width = 32, 32 - rescale = 1.0 / 255.0 - shift = 0.0 - rescale_nml = 1 / 0.3081 - shift_nml = -1 * 0.1307 / 0.3081 - - # define map operations - resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode - rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) - rescale_op = CV.Rescale(rescale, shift) - hwc2chw_op = CV.HWC2CHW() - type_cast_op = C.TypeCast(mstype.int32) - - # apply map operations on images - mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) - - # apply DatasetOps - buffer_size = 10000 - mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script - mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) - mnist_ds = mnist_ds.repeat(repeat_size) - - return mnist_ds diff --git a/npu/dataset_distributed.py b/npu/dataset_distributed.py deleted file mode 100755 index d813078..0000000 --- a/npu/dataset_distributed.py +++ /dev/null @@ -1,55 +0,0 @@ - -""" -Produce the dataset: -与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取: -get_rank:获取当前设备在集群中的ID。 -get_group_size:获取集群数量。 - -""" - -import mindspore.dataset as ds -import mindspore.dataset.vision.c_transforms as CV -import mindspore.dataset.transforms.c_transforms as C -from mindspore.dataset.vision import Inter -from mindspore.common import dtype as mstype -from mindspore.communication.management import init, get_rank, get_group_size - - -def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, - num_parallel_workers=1, shard_id=0, num_shards=8): - """ - create dataset for train or test - """ - - resize_height, resize_width = 32, 32 - rescale = 1.0 / 255.0 - shift = 0.0 - rescale_nml = 1 / 0.3081 - shift_nml = -1 * 0.1307 / 0.3081 - # get shard_id and num_shards.Get the ID of the current device in the cluster And Get the number of clusters. - shard_id = get_rank() - num_shards = get_group_size() - # define dataset - mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) - - # define map operations - resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode - rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) - rescale_op = CV.Rescale(rescale, shift) - hwc2chw_op = CV.HWC2CHW() - type_cast_op = C.TypeCast(mstype.int32) - - # apply map operations on images - mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) - - # apply DatasetOps - buffer_size = 10000 - mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script - mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) - mnist_ds = mnist_ds.repeat(repeat_size) - - return mnist_ds diff --git a/npu/inference.py b/npu/inference.py deleted file mode 100755 index f0501e9..0000000 --- a/npu/inference.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -######################## single-dataset inference lenet example ######################## -This example is a single-dataset inference tutorial. - -######################## Instructions for using the inference environment ######################## -The image of the debugging environment and the image of the inference environment are two different images, -and the working local directories are different. In the inference task, you need to pay attention to the following points. -1、(1)The structure of the dataset uploaded for single dataset inference in this example - MNISTData.zip - ├── test - │ ├── t10k-images-idx3-ubyte - │ └── t10k-labels-idx1-ubyte - └── train - ├── train-images-idx3-ubyte - └── train-labels-idx1-ubyte - - (2)The dataset structure of the single dataset in the inference image in this example - workroot - ├── data - | ├── test - | └── train - -2、Inference task requires predefined functions -(1)Defines whether the task is a inference environment or a debugging environment. -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' #The inference task uses this parameter to represent the local path of the inference image - elif environment == 'debug': - workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -(2)Copy single dataset from obs to inference image. -def ObsToEnv(obs_data_url, data_dir): - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) - return - -(3)Copy ckpt file from obs to inference image. -def ObsUrlToEnv(obs_ckpt_url, ckpt_url): - try: - mox.file.copy(obs_ckpt_url, ckpt_url) - print("Successfully Download {} to {}".format(obs_ckpt_url, - ckpt_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_ckpt_url, ckpt_url) + str(e)) - return - -(4)Copy the output result to obs. -def EnvToObs(train_dir, obs_train_url): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) - return - -3、4 parameters need to be defined. ---data_url is the dataset you selected on the Qizhi platform ---ckpt_url is the weight file you choose on the Qizhi platform - ---data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a single dataset, -otherwise an error will be reported. -There is no need to add these parameters to the running parameters of the Qizhi platform, -because they are predefined in the background, you only need to define them in your code. - -4、How the dataset is used -Inference task uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method -of the dataset in the image. -For details, please refer to the following sample code. -""" - -import os -import argparse -import moxing as mox -import mindspore.nn as nn -from mindspore import context -from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore import Tensor -import numpy as np -from glob import glob -from dataset import create_dataset -from config import mnist_cfg as cfg -from lenet import LeNet5 - -### Defines whether the task is a inference environment or a debugging environment ### -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' - elif environment == 'debug': - workroot = '/home/work' - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -### Copy single dataset from obs to inference image ### -def ObsToEnv(obs_data_url, data_dir): - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) - return -### Copy ckpt file from obs to inference image### -### To operate on folders, use mox.file.copy_parallel. If copying a file. -### Please use mox.file.copy to operate the file, this operation is to operate the file -def ObsUrlToEnv(obs_ckpt_url, ckpt_url): - try: - mox.file.copy(obs_ckpt_url, ckpt_url) - print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) - return -### Copy the output result to obs### -def EnvToObs(train_dir, obs_train_url): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) - return - -### --data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a inference task, -### otherwise an error will be reported. -### There is no need to add these parameters to the running parameters of the Qizhi platform, -### because they are predefined in the background, you only need to define them in your code. -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') -parser.add_argument('--data_url', - type=str, - default= WorkEnvironment('train') + '/data/', - help='path where the dataset is saved') -parser.add_argument('--ckpt_url', - help='model to save/load', - default= WorkEnvironment('train') + '/checkpoint.ckpt') -parser.add_argument('--result_url', - help='result folder to save/load', - default= WorkEnvironment('train') + '/result/') -parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], - help='device where the code will be implemented (default: Ascend)') - -if __name__ == "__main__": - args, unknown = parser.parse_known_args() - ### defining the training environment - environment = 'train' - workroot = WorkEnvironment(environment) - - ###Initialize the data and result directories in the inference image### - data_dir = workroot + '/data' - result_dir = workroot + '/result' - ckpt_url = workroot + '/checkpoint.ckpt' - if not os.path.exists(data_dir): - os.makedirs(data_dir) - if not os.path.exists(result_dir): - os.makedirs(result_dir) - - ###Copy dataset from obs to inference image - obs_data_url = args.data_url - ObsToEnv(obs_data_url, data_dir) - - ###Copy ckpt file from obs to inference image - obs_ckpt_url = args.ckpt_url - ObsUrlToEnv(obs_ckpt_url, ckpt_url) - - ###Set output path result_url - obs_result_url = args.result_url - - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - repeat_size = cfg.epoch_size - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) - - print("============== Starting Testing ==============") - - param_dict = load_checkpoint(os.path.join(ckpt_url)) - load_param_into_net(network, param_dict) - ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator() - data = next(ds_test) - images = data["image"].asnumpy() - labels = data["label"].asnumpy() - print('Tensor:', Tensor(data['image'])) - output = model.predict(Tensor(data['image'])) - predicted = np.argmax(output.asnumpy(), axis=1) - pred = np.argmax(output.asnumpy(), axis=1) - print('predicted:', predicted) - print('pred:', pred) - - print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') - filename = 'result.txt' - file_path = os.path.join(result_dir, filename) - with open(file_path, 'a+') as file: - file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) - - ###Copy result data from the local running environment back to obs, - ###and download it in the inference task corresponding to the Qizhi platform - EnvToObs(result_dir, obs_result_url) \ No newline at end of file diff --git a/npu/lenet.py b/npu/lenet.py deleted file mode 100755 index 0600793..0000000 --- a/npu/lenet.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""LeNet.""" -import mindspore.nn as nn -from mindspore.common.initializer import Normal - - -class LeNet5(nn.Cell): - """ - Lenet network - - Args: - num_class (int): Number of classes. Default: 10. - num_channel (int): Number of channels. Default: 1. - - Returns: - Tensor, output tensor - Examples: - >>> LeNet(num_class=10) - - """ - def __init__(self, num_class=10, num_channel=1, include_top=True): - super(LeNet5, self).__init__() - self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') - self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') - self.relu = nn.ReLU() - self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) - self.include_top = include_top - if self.include_top: - self.flatten = nn.Flatten() - self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) - self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) - self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) - - def construct(self, x): - x = self.conv1(x) - x = self.relu(x) - x = self.max_pool2d(x) - x = self.conv2(x) - x = self.relu(x) - x = self.max_pool2d(x) - if not self.include_top: - return x - x = self.flatten(x) - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - x = self.fc3(x) - return x diff --git a/npu/train.py b/npu/train.py deleted file mode 100755 index 42d09fe..0000000 --- a/npu/train.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -######################## single-dataset train lenet example ######################## -This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training -tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! - -######################## Instructions for using the training environment ######################## -The image of the debugging environment and the image of the training environment are two different images, -and the working local directories are different. In the training task, you need to pay attention to the following points. -1、(1)The structure of the dataset uploaded for single dataset training in this example - MNISTData.zip - ├── test - │ ├── t10k-images-idx3-ubyte - │ └── t10k-labels-idx1-ubyte - └── train - ├── train-images-idx3-ubyte - └── train-labels-idx1-ubyte - - (2)The dataset structure of the single dataset in the training image in this example - workroot - ├── data - | ├── test - | └── train - -2、Single dataset training requires predefined functions -(1)Defines whether the task is a training environment or a debugging environment. -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image - elif environment == 'debug': - workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -(2)Copy single dataset from obs to training image. -def ObsToEnv(obs_data_url, data_dir): - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) - return - -(3)Copy the output model to obs. -def EnvToObs(train_dir, obs_train_url): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) - return - -3、3 parameters need to be defined ---data_url is the dataset you selected on the Qizhi platform - ---data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, -otherwise an error will be reported. -There is no need to add these parameters to the running parameters of the Qizhi platform, -because they are predefined in the background, you only need to define them in your code. - -4、How the dataset is used -A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method -of the dataset in the image. -For details, please refer to the following sample code. - -""" - -import os -import argparse -import moxing as mox -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore import context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.common import set_seed - -### Defines whether the task is a training environment or a debugging environment ### -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' - elif environment == 'debug': - workroot = '/home/work' - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -### Copy single dataset from obs to training image### -def ObsToEnv(obs_data_url, data_dir): - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) - return -### Copy the output model to obs### -def EnvToObs(train_dir, obs_train_url): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) - return - -### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, -### otherwise an error will be reported. -###There is no need to add these parameters to the running parameters of the Qizhi platform, -###because they are predefined in the background, you only need to define them in your code. -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') -parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= WorkEnvironment('train') + '/data/') - -parser.add_argument('--train_url', - help='model folder to save/load', - default= WorkEnvironment('train') + '/model/') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') - -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -if __name__ == "__main__": - args, unknown = parser.parse_known_args() - ### defining the training environment - environment = 'train' - workroot = WorkEnvironment(environment) - - ###Initialize the data and model directories in the training image### - data_dir = workroot + '/data' - train_dir = workroot + '/model' - if not os.path.exists(data_dir): - os.makedirs(data_dir) - if not os.path.exists(train_dir): - os.makedirs(train_dir) - - ### Copy the dataset from obs to the training image ### - ObsToEnv(args.data_url,data_dir) - - ###Specifies the device CPU or Ascend NPU used for training### - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - ds_train = create_dataset(os.path.join(data_dir, "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", - directory=train_dir, - config=config_ck) - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - - ###Copy the trained model data from the local running environment back to obs, - ###and download it in the training task corresponding to the Qizhi platform - EnvToObs(train_dir, args.train_url) - diff --git a/npu/train_dataparallel.py b/npu/train_dataparallel.py deleted file mode 100755 index 19ba828..0000000 --- a/npu/train_dataparallel.py +++ /dev/null @@ -1,205 +0,0 @@ -""" -######################## single-dataset train lenet example ######################## -This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training -tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! - -######################## Instructions for using the training environment ######################## -The image of the debugging environment and the image of the training environment are two different images, -and the working local directories are different. In the training task, you need to pay attention to the following points. -1、(1)The structure of the dataset uploaded for single dataset training in this example - MNISTData.zip - ├── test - │ ├── t10k-images-idx3-ubyte - │ └── t10k-labels-idx1-ubyte - └── train - ├── train-images-idx3-ubyte - └── train-labels-idx1-ubyte - - (2)The dataset structure of the single dataset in the training image in this example - workroot - ├── data - | ├── test - | └── train - -2、Single dataset training requires predefined functions -(1)Defines whether the task is a training environment or a debugging environment. -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image - elif environment == 'debug': - workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -(2)Copy single dataset from obs to training image. -def ObsToEnv(obs_data_url, data_dir): - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) - return - -(3)Copy the output model to obs. -def EnvToObs(train_dir, obs_train_url): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) - return - -3、3 parameters need to be defined ---data_url is the dataset you selected on the Qizhi platform - ---data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, -otherwise an error will be reported. -There is no need to add these parameters to the running parameters of the Qizhi platform, -because they are predefined in the background, you only need to define them in your code. - -4、How the dataset is used -A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method -of the dataset in the image. -For details, please refer to the following sample code. - -""" - -import os -import argparse -from dataset_distributed import create_dataset_parallel -import moxing as mox -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore import context -from mindspore.common import set_seed -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.context import ParallelMode -from mindspore.communication.management import init, get_rank, get_group_size -import mindspore.ops as ops - - -# set device_id and init -device_id = int(os.getenv('DEVICE_ID')) -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") -context.set_context(device_id=device_id) -init() - -### Defines whether the task is a training environment or a debugging environment ### -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' - elif environment == 'debug': - workroot = '/home/work' - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -### Copy single dataset from obs to training image### -def ObsToEnv(obs_data_url, data_dir): - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) - return -### Copy the output model to obs### -def EnvToObs(train_dir, obs_train_url): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) - return - -### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, -### otherwise an error will be reported. -###There is no need to add these parameters to the running parameters of the Qizhi platform, -###because they are predefined in the background, you only need to define them in your code. -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') -parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= WorkEnvironment('train') + '/data/') - -parser.add_argument('--train_url', - help='model folder to save/load', - default= WorkEnvironment('train') + '/model/') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') - -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') -set_seed(114514) -if __name__ == "__main__": - args = parser.parse_args() - ### defining the training environment - environment = 'train' - workroot = WorkEnvironment(environment) - - ###Initialize the data and model directories in the training image### - data_dir = workroot + '/data' - train_dir = workroot + '/model' - if not os.path.exists(data_dir): - os.makedirs(data_dir) - if not os.path.exists(train_dir): - os.makedirs(train_dir) - - ### Copy the dataset from obs to the training image ### - ObsToEnv(args.data_url,data_dir) - - context.reset_auto_parallel_context() - context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) - ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - #Note that this method saves the model file on each card. You need to specify the save path on each card. - # In the example, get_rank() is added to distinguish different paths. - ckpoint_cb = ModelCheckpoint(prefix="data_parallel", - directory=train_dir + "/" + str(get_rank()) + "/", - config=config_ck) - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()], dataset_sink_mode=True) - - ###Copy the trained model data from the local running environment back to obs, - ###and download it in the training task corresponding to the Qizhi platform - EnvToObs(train_dir, args.train_url) - diff --git a/npu/train_for_c2net.py b/npu/train_for_c2net.py deleted file mode 100755 index c8cd10a..0000000 --- a/npu/train_for_c2net.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -######################## train lenet example ######################## -train lenet and get network model files(.ckpt) - -The training of the intelligent computing network currently supports single dataset training, and does not require -the obs copy process.It only needs to define two parameters and then call it directly: - train_dir = '/cache/output' #The location of the output - data_dir = '/cache/dataset' #The location of the dataset -""" -#!/usr/bin/python -#coding=utf-8 - -import os -import argparse -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore import context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.common import set_seed - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') - -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -set_seed(1) - -if __name__ == "__main__": - args, unknown = parser.parse_known_args() - print('args:') - print(args) - - ###define two parameters and then call it directly### - train_dir = '/cache/output' - data_dir = '/cache/dataset' - - ###Specifies the device CPU or Ascend NPU used for training### - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - ds_train = create_dataset(os.path.join(data_dir, "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", - directory=train_dir, - config=config_ck) - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - - print("============== Finish Training ==============") \ No newline at end of file diff --git a/npu/train_for_multidataset.py b/npu/train_for_multidataset.py deleted file mode 100755 index 0e08815..0000000 --- a/npu/train_for_multidataset.py +++ /dev/null @@ -1,237 +0,0 @@ -""" -######################## multi-dataset train lenet example ######################## -This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset -training tutorial train.py. This example cannot be used for a single dataset! -""" -""" -######################## Instructions for using the training environment ######################## -1、(1)The structure of the dataset uploaded for multi-dataset training in this example - MNISTData.zip - ├── test - │ ├── t10k-images-idx3-ubyte - │ └── t10k-labels-idx1-ubyte - └── train - ├── train-images-idx3-ubyte - └── train-labels-idx1-ubyte - - checkpoint_lenet-1_1875.zip - ├── checkpoint_lenet-1_1875.ckpt - - (2)The dataset structure in the training image for multiple datasets in this example - workroot - ├── MNISTData - | ├── test - | └── train - └── checkpoint_lenet-1_1875 - ├── checkpoint_lenet-1_1875.ckpt - -2、Multi-dataset training requires predefined functions -(1)Defines whether the task is a training environment or a debugging environment. -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image - elif environment == 'debug': - workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -(2)Copy multiple datasets from obs to training image -def MultiObsToEnv(multi_data_url, workroot): - multi_data_json = json.loads(multi_data_url) #Parse multi_data_url - for i in range(len(multi_data_json)): - path = workroot + "/" + multi_data_json[i]["dataset_name"] - if not os.path.exists(path): - os.makedirs(path) - try: - mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) - print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], - path)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - multi_data_json[i]["dataset_url"], path) + str(e)) - return - -***The input and output of the MultiObsToEnv function in this example: - Input for multi_data_url: - [ - { - "dataset_url": "s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388e - ae3a316-42d6-4a43-a484-1fa573eab388/", #obs path of the dataset - "dataset_name": "MNIST_Data" #the name of the dataset - }, - { - "dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c - 59be66-64ec-41ca-b311-f51a486eabf8/", - "dataset_name": "checkpoint_lenet-1_1875" - } - ] - Purpose of multi_data_url: - The purpose of the MultiObsToEnv function is to copy multiple datasets from obs to the training image - and build the dataset path in the training image. - For example, the path of the MNIST_Data dataset in this example is /home/work/user-job-dir/MNISTData, - The path to the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875 - -(3)Copy the output model to obs. -def EnvToObs(obs_train_url, train_dir): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir, - obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir, - obs_train_url) + str(e)) - return - -3、4 parameters need to be defined ---data_url is the first dataset you selected on the Qizhi platform ---multi_data_url is the multi-dataset you selected on the Qizhi platform - ---data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, -otherwise an error will be reported. -There is no need to add these parameters to the running parameters of the Qizhi platform, -because they are predefined in the background, you only need to define them in your code - -4、How the dataset is used -Multi-datasets use multi_data_url as input, workroot + dataset name + file or folder name in the dataset as the -calling path of the dataset in the training image. -For example, the calling path of the train folder in the MNIST_Data dataset in this example is -workroot + "/MNIST_Data" +"/train" - -For details, please refer to the following sample code. -""" - -import os -import argparse - -import moxing as mox -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import json -import mindspore.nn as nn -from mindspore import context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.common import set_seed -from mindspore import load_checkpoint, load_param_into_net - -### Defines whether the task is a training environment or a debugging environment ### -def WorkEnvironment(environment): - if environment == 'train': - workroot = '/home/work/user-job-dir' - elif environment == 'debug': - workroot = '/home/ma-user/work' - print('current work mode:' + environment + ', workroot:' + workroot) - return workroot - -### Copy multiple datasets from obs to training image ### -def MultiObsToEnv(multi_data_url, workroot): - multi_data_json = json.loads(multi_data_url) - for i in range(len(multi_data_json)): - path = workroot + "/" + multi_data_json[i]["dataset_name"] - if not os.path.exists(path): - os.makedirs(path) - try: - mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) - print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], - path)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - multi_data_json[i]["dataset_url"], path) + str(e)) - return -### Copy the output model to obs ### -def EnvToObs(obs_train_url, train_dir): - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir, - obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir, - obs_train_url) + str(e)) - return - - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') -### --data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset, -### otherwise an error will be reported. -### There is no need to add these parameters to the running parameters of the Qizhi platform, -### because they are predefined in the background, you only need to define them in your code. -parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= WorkEnvironment('train') + '/data/') - -parser.add_argument('--multi_data_url', - help='path to multi dataset', - default= WorkEnvironment('train')) - -parser.add_argument('--train_url', - help='model folder to save/load', - default= WorkEnvironment('train') + '/model/') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') - -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -if __name__ == "__main__": - args, unknown = parser.parse_known_args() - # After defining the training environment, first execute the WorkEnv function and the GetMultiDataPath function to - # copy multiple datasets from obs to the training image - environment = 'train' - workroot = WorkEnvironment(environment) - MultiObsToEnv(args.multi_data_url, workroot) - - ### Define the output path in the training image - train_dir = workroot + '/model' - if not os.path.exists(train_dir): - os.makedirs(train_dir) - - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - #The dataset path is used here:workroot + "/MNIST_Data" +"/train" "" - ds_train = create_dataset(os.path.join(workroot + "/MNISTData", "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - ### Load the trained model:workroot + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" - load_param_into_net(network, load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875", - "checkpoint_lenet-1_1875.ckpt"))) - - if args.device_target != "Ascend": - model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) - else: - model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") - - config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", - directory=train_dir, - config=config_ck) - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - ###Copy the trained model data from the local running environment back to obs, - ###and download it in the training task corresponding to the Qizhi platform - EnvToObs(train_dir, args.train_url) -