diff --git a/Example_Picture/数据集上传位置.png b/Example_Picture/数据集上传位置.png new file mode 100755 index 0000000..e1a6fce Binary files /dev/null and b/Example_Picture/数据集上传位置.png differ diff --git a/Example_Picture/新建训练任务页面.png b/Example_Picture/新建训练任务页面.png new file mode 100755 index 0000000..4ff5a1e Binary files /dev/null and b/Example_Picture/新建训练任务页面.png differ diff --git a/Example_Picture/查看日志页面.png b/Example_Picture/查看日志页面.png new file mode 100755 index 0000000..7ea7f97 Binary files /dev/null and b/Example_Picture/查看日志页面.png differ diff --git a/Example_Picture/模型下载页面.png b/Example_Picture/模型下载页面.png new file mode 100755 index 0000000..61aafac Binary files /dev/null and b/Example_Picture/模型下载页面.png differ diff --git a/Example_Picture/运行参数界面.png b/Example_Picture/运行参数界面.png new file mode 100755 index 0000000..16ef61c Binary files /dev/null and b/Example_Picture/运行参数界面.png differ diff --git a/config.py b/config.py
new file mode 100755
index 0000000..e191906
--- /dev/null
+++ b/config.py
@@ -0,0 +1,33 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+network config setting, will be used in train.py
+"""
+
+from easydict import EasyDict as edict
+
+mnist_cfg = edict({
+    'num_classes': 10,
+    'lr': 0.01,
+    'momentum': 0.9,
+    'epoch_size': 10,
+    'batch_size': 32,
+    'buffer_size': 1000,
+    'image_height': 32,
+    'image_width': 32,
+    'save_checkpoint_steps': 1875,
+    'keep_checkpoint_max': 10,
+    'air_name': "lenet",
+})
diff --git a/convert_pytorch.py b/convert_pytorch.py
new file mode 100755
index 0000000..0aeb4ff
--- /dev/null
+++ b/convert_pytorch.py
@@ -0,0 +1,59 @@
+import torchvision
+import torch
+import argparse
+from torch.autograd import Variable
+import onnx
+print(torch.__version__)
+
+parser = argparse.ArgumentParser(description='PyTorch to ONNX conversion example')
+
+parser.add_argument('--model',
+                    type=str,
+                    help='name of the model file under /dataset'
+                    )
+parser.add_argument('--n',
+                    type=int,
+                    default=256,
+                    help='batch size for input shape type'
+                    )
+parser.add_argument('--c',
+                    type=int,
+                    default=1,
+                    help='channel for input shape type'
+                    )
+parser.add_argument('--h',
+                    type=int,
+                    default=28,
+                    help='height for input shape type'
+                    )
+parser.add_argument('--w',
+                    type=int,
+                    default=28,
+                    help='width for input shape type'
+                    )
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print('args:')
+    print(args)
+
+    model_file = '/dataset/' + args.model
+    print(model_file)
+    model = torch.load(model_file)
+    print(model)
+    print(type(model))
+    for k, v in model.named_parameters():
+        print("k:", k)
+        print("v:", v.shape)
+
+    # use rfind (not rindex) so a model name without "." yields -1 instead of raising ValueError
+    suffix = args.model.rfind(".")
+    out_file = '/model/' + args.model + ".onnx"
+    if suffix != -1:
+        out_file = '/model/' + args.model[0:suffix] + ".onnx"
+    print(out_file)
+    input_name = ['input']
+    output_name = ['output']
+    input = Variable(torch.randn(args.n, args.c, args.h, args.w))
+    torch.onnx.export(model, input, out_file, 
input_names=input_name, output_names=output_name, verbose=True) + + diff --git a/convert_to_onnx.py b/convert_to_onnx.py new file mode 100755 index 0000000..4390052 --- /dev/null +++ b/convert_to_onnx.py @@ -0,0 +1,22 @@ +import numpy as np +from mindspore import Tensor, export, load_checkpoint +from mindvision.classification.models import resnet50 +from mindvision.dataset import DownLoad + +# 下载Resnet50的预训练模型 +dl = DownLoad() +dl.download_url('https://download.mindspore.cn/vision/classification/resnet50_224.ckpt') + +resnet = resnet50(1000) +load_checkpoint("resnet50_224.ckpt", net=resnet) + +input_np = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32) + +# 导出文件resnet50_224.mindir到当前文件夹 +export(resnet, Tensor(input_np), file_name='resnet50_224', file_format='MINDIR') + + + +# 保存resnet50_224.onnx文件到当前目录下 +export(resnet, Tensor(input_np), file_name='resnet50_224', file_format='ONNX') + diff --git a/dataset.py b/dataset.py new file mode 100755 index 0000000..df9eecd --- /dev/null +++ b/dataset.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Produce the dataset +""" + +import mindspore.dataset as ds +import mindspore.dataset.vision.c_transforms as CV +import mindspore.dataset.transforms.c_transforms as C +from mindspore.dataset.vision import Inter +from mindspore.common import dtype as mstype + + +def create_dataset(data_path, batch_size=32, repeat_size=1, + num_parallel_workers=1): + """ + create dataset for train or test + """ + # define dataset + mnist_ds = ds.MnistDataset(data_path) + + resize_height, resize_width = 32, 32 + rescale = 1.0 / 255.0 + shift = 0.0 + rescale_nml = 1 / 0.3081 + shift_nml = -1 * 0.1307 / 0.3081 + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/gpu/inference.py b/gpu/inference.py new file 
mode 100755
index 0000000..c5e0ad8
--- /dev/null
+++ b/gpu/inference.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+GPU INFERENCE INSTANCE
+
+If the code contains Chinese comments, please add the following at the top of the file:
+#!/usr/bin/python
+#coding=utf-8
+For A100 compatibility, please use one of the platform's recommended images with CUDA 11,
+then adjust the code and submit the image.
+The image used in this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the inference environment, the uploaded dataset is automatically placed under the /dataset directory;
+if MnistDataset_torch.zip is selected, the dataset directory is /dataset/test.
+The selected model file is placed under the /model directory.
+The result output path is /result, and the Qizhi platform provides file downloads from the /result directory.
+'''
+
+
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import os
+import argparse
+
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# name of the model file to load
+parser.add_argument('--modelname', help='model name')
+
+
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    test_dataset = mnist.MNIST(root='/dataset/test', train=False, transform=ToTensor(),
+                               download=False)
+    test_loader = DataLoader(test_dataset, batch_size=256)
+    # if the file name is fixed, model_path can simply be hard-coded
+    model_path = '/model/' + args.modelname
+
+    model = torch.load(model_path).to(device)
+    model.eval()
+
+    correct = 0
+    _sum = 0
+
+    for idx, (test_x, test_label) in enumerate(test_loader):
+        test_x = test_x
+        test_label = test_label
+        predict_y = model(test_x.to(device).float()).detach()
+        predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+        label_np = test_label.numpy()
+        _ = predict_ys == test_label
+        correct += np.sum(_.numpy(), axis=-1)
+        _sum += _.shape[0]
+    print('accuracy: {:.2f}'.format(correct / _sum))
+    # write the result to /result
+    filename = 'result.txt'
+    file_path = os.path.join('/result', filename)
+    with open(file_path, 'w') as file:
+        file.write('accuracy: {:.2f}'.format(correct / _sum))
\ No newline at end of file
diff --git a/gpu/model.py b/gpu/model.py
new file mode 100755
index 0000000..ae424a7
--- /dev/null
+++ b/gpu/model.py
@@ -0,0 +1,35 @@
+from torch.nn import Module
+from torch import nn
+
+
+class Model(Module):
+    def __init__(self):
+        super(Model, self).__init__()
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.relu1 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.relu2 = nn.ReLU()
+        self.pool2 = nn.MaxPool2d(2)
+        self.fc1 = nn.Linear(256, 120)
+        self.relu3 = nn.ReLU()
+        self.fc2 = nn.Linear(120, 84)
+        self.relu4 = nn.ReLU()
+        self.fc3 = nn.Linear(84, 10)
+        self.relu5 = nn.ReLU()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.relu1(y)
+        y = self.pool1(y)
+        y = self.conv2(y)
+        y = self.relu2(y)
+        y = self.pool2(y)
+        y = y.view(y.shape[0], -1)
+        y = self.fc1(y)
+        y = self.relu3(y)
+        y = self.fc2(y)
+        y = self.relu4(y)
+        y = self.fc3(y)
+        y = self.relu5(y)
+        return y
diff --git a/gpu/train.py b/gpu/train.py
new file mode 100755
index 0000000..ccedb05
--- /dev/null
+++ b/gpu/train.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If the code contains Chinese comments, please add the following at the top of the file:
+#!/usr/bin/python
+#coding=utf-8
+
+For A100 compatibility, please debug your code in the debugging environment with one of the
+platform's recommended images with CUDA 11 before using the training environment, then adjust
+the code and submit the image.
+The image used in this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the training environment, the uploaded dataset is automatically placed under the /dataset directory.
+For a single dataset:
+if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test.
+For multiple datasets:
+if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
+the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
+and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
+
+The model download path defaults to /model. Please write the model output to /model;
+the Qizhi platform provides file downloads from the /model directory.
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# the dataset is placed under /dataset
+parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size of each epoch')
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    # log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    print('epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        print('the {} epoch_size begin'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            label_np = np.zeros((train_label.shape[0], 10))
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            sgd.step()
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            test_x = test_x
+            test_label = test_label
+            predict_y = model(test_x.to(device).float()).detach()
+            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+            label_np = test_label.numpy()
+            _ = predict_ys == test_label
+            correct += np.sum(_.numpy(), axis=-1)
+            _sum += _.shape[0]
+        
print('accuracy: {:.2f}'.format(correct / _sum)) + #The model output location is placed under /model + torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) diff --git a/gpu/train_for_c2net.py b/gpu/train_for_c2net.py new file mode 100755 index 0000000..4361c18 --- /dev/null +++ b/gpu/train_for_c2net.py @@ -0,0 +1,71 @@ +''' +在训练环境中,代码会自动放在/tmp/code目录下,上传的数据集会自动放在/tmp/dataset目录下,模型下载路径默认在/tmp/output下,请将模型输出位置指定到/tmp/model, +启智平台界面会提供/tmp/output目录下的文件下载。 +''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#数据集位置放在/tmp/dataset下 +parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +if __name__ == '__main__': + args = parser.parse_args() + #日志输出 + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + sgd = SGD(model.parameters(), lr=1e-1) + cost = CrossEntropyLoss() + epoch = args.epoch_size + #日志输出 + print('epoch_size is:{}'.format(epoch)) + for _epoch in range(epoch): + print('the {} epoch_size begin'.format(_epoch + 1)) + model.train() + for idx, (train_x, train_label) in enumerate(train_loader): + train_x = train_x.to(device) + train_label = train_label.to(device) + label_np = np.zeros((train_label.shape[0], 10)) + sgd.zero_grad() + predict_y = model(train_x.float()) + loss = cost(predict_y, train_label.long()) + if idx % 10 == 0: + print('idx: {}, loss: {}'.format(idx, loss.sum().item())) + loss.backward() + sgd.step() + + correct = 0 + _sum = 0 + model.eval() + for idx, (test_x, test_label) in enumerate(test_loader): + test_x = test_x + test_label = test_label + predict_y = model(test_x.to(device).float()).detach() + predict_ys = np.argmax(predict_y.cpu(), axis=-1) + label_np = test_label.numpy() + _ = predict_ys == test_label + correct += np.sum(_.numpy(), axis=-1) + _sum += _.shape[0] + #日志输出 + print('accuracy: {:.2f}'.format(correct / _sum)) + #模型输出位置放在/tmp/output下 + torch.save(model, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) \ No newline at end of file diff --git a/gpu/train_for_multidataset.py b/gpu/train_for_multidataset.py new file mode 100755 index 0000000..0eeec2f --- /dev/null +++ b/gpu/train_for_multidataset.py @@ -0,0 +1,108 @@ +''' +1,本示例中多数据集训练上传的数据集结构 + MnistDataset_torch.zip + ├── test + └── train + + checkpoint_epoch1_0.73.zip + ├── mnist_epoch1_0.73.pkl + +2,由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码, +本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并 
+提交镜像,再切到训练环境训练已跑通的代码。 +在训练环境中,上传的数据集会自动放在/dataset目录下,注意:选择单数据集和多数据集时的路径不同! +(1)如果是单数据集:如选择的是MnistDataset_torch.zip,则数据集目录为/dataset/train、/dataset/test; + 本示例中单数据集在训练镜像中的数据集结构 + dataset + ├── test + └── train +(2)如选择的是多数据集,如选择的是MnistDataset_torch.zip和checkpoint_epoch1_0.73.zip,则数据集 +目录为/dataset/MnistDataset_torch/train、/dataset/MnistDataset_torch/test +和/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl + 本示例中多数据集在训练镜像中的数据集结构 + dataset + ├── MnistDataset_torch + | ├── test + | └── train + └── checkpoint_epoch1_0.73 + ├── mnist_epoch1_0.73.pkl + + +模型下载路径默认在/model下,请将模型输出位置指定到/model,启智平台界面会提供/model目录下的文件下载。 + + + +''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#数据集位置放在/dataset下 +parser.add_argument('--traindata', default="/dataset/MnistDataset_torch/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/MnistDataset_torch/test" ,help='path to test dataset') +parser.add_argument('--checkpoint', default="/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl" ,help='checkpoint file') +parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +if __name__ == '__main__': + args = parser.parse_args() + #日志输出 + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + sgd = SGD(model.parameters(), lr=1e-1) + cost = CrossEntropyLoss() + epoch = args.epoch_size + #日志输出 + print('epoch_size is:{}'.format(epoch)) + #加载已训练好的模型: + # path = args.checkpoint + # checkpoint = torch.load(path, map_location=device) + # model.load_state_dict(checkpoint) + #开始训练 + for _epoch in range(epoch): + print('the {} epoch_size begin'.format(_epoch + 1)) + model.train() + for idx, (train_x, train_label) in enumerate(train_loader): + train_x = train_x.to(device) + train_label = train_label.to(device) + label_np = np.zeros((train_label.shape[0], 10)) + sgd.zero_grad() + predict_y = model(train_x.float()) + loss = cost(predict_y, train_label.long()) + if idx % 10 == 0: + print('idx: {}, loss: {}'.format(idx, loss.sum().item())) + loss.backward() + sgd.step() + + correct = 0 + _sum = 0 + model.eval() + for idx, (test_x, test_label) in enumerate(test_loader): + test_x = test_x + test_label = test_label + predict_y = model(test_x.to(device).float()).detach() + predict_ys = np.argmax(predict_y.cpu(), axis=-1) + label_np = test_label.numpy() + _ = predict_ys == test_label + correct += np.sum(_.numpy(), axis=-1) + _sum += _.shape[0] + #日志输出 + print('accuracy: {:.2f}'.format(correct / _sum)) + #模型输出位置放在/model下 + torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) \ No newline at end of file diff --git a/gpu_train.py b/gpu_train.py new file mode 100755 
index 0000000..9a80582 --- /dev/null +++ b/gpu_train.py @@ -0,0 +1,74 @@ +''' +由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码, +本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并 +提交镜像,再切到训练环境训练已跑通的代码。 +在训练环境中,上传的数据集会自动放在/dataset目录下,模型下载路径默认在/model下,请将模型输出位置指定到/model, +启智平台界面会提供/model目录下的文件下载。 +''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#数据集位置放在/dataset下 +parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +if __name__ == '__main__': + args = parser.parse_args() + #日志输出 + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + sgd = SGD(model.parameters(), lr=1e-1) + cost = CrossEntropyLoss() + epoch = args.epoch_size + #日志输出 + print('epoch_size is:{}'.format(epoch)) + for _epoch in range(epoch): + print('the {} epoch_size begin'.format(_epoch + 1)) + model.train() + for idx, (train_x, train_label) in enumerate(train_loader): + train_x = train_x.to(device) + train_label = train_label.to(device) + label_np = np.zeros((train_label.shape[0], 10)) + sgd.zero_grad() + predict_y = model(train_x.float()) + loss = cost(predict_y, train_label.long()) + if idx % 10 == 0: + print('idx: {}, loss: {}'.format(idx, loss.sum().item())) + loss.backward() + sgd.step() + + correct = 0 + _sum = 0 + model.eval() + for idx, (test_x, test_label) in enumerate(test_loader): + test_x = test_x + test_label = test_label + predict_y = model(test_x.to(device).float()).detach() + predict_ys = np.argmax(predict_y.cpu(), axis=-1) + label_np = test_label.numpy() + _ = predict_ys == test_label + correct += np.sum(_.numpy(), axis=-1) + _sum += _.shape[0] + #日志输出 + print('accuracy: {:.2f}'.format(correct / _sum)) + #模型输出位置放在/model下 + torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) diff --git a/gpu_train_resnet50.py b/gpu_train_resnet50.py new file mode 100755 index 0000000..9bf6157 --- /dev/null +++ b/gpu_train_resnet50.py @@ -0,0 +1,30 @@ +''' +由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码, +本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并 +提交镜像,再切到训练环境训练已跑通的代码。 +在训练环境中,上传的数据集会自动放在/dataset目录下,模型下载路径默认在/model下,请将模型输出位置指定到/model, +启智平台界面会提供/model目录下的文件下载。 +''' + +import torchvision +from torch.autograd import Variable +import torch +import argparse + +# Training settings +parser = argparse.ArgumentParser(description='Resnet50 Example') +#数据集位置放在/dataset下 +parser.add_argument('--traindata', 
default="/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size of each epoch')
+
+if __name__ == '__main__':
+    input_name = ['input']
+    output_name = ['output']
+    input = Variable(torch.randn(1, 3, 224, 224)).cuda()
+    model = torchvision.models.resnet50(pretrained=True).cuda()
+
+    # write the model output under /model
+    torch.save(model, '/model/resnet50.pth')
+
diff --git a/grampus_tf_train.py b/grampus_tf_train.py
new file mode 100755
index 0000000..86bb261
--- /dev/null
+++ b/grampus_tf_train.py
@@ -0,0 +1,154 @@
+# coding: utf-8
+import tensorflow as tf
+from tensorflow.examples.tutorials.mnist import input_data
+import os
+import argparse
+import moxing as mox
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+workroot = '/cache/'
+# initialize the convolution filters
+def weight_variable(shape):
+    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
+
+# initialize the biases; every value starts at 0.1
+def bias_variable(shape):
+    return tf.Variable(tf.constant(0.1, shape=shape))
+
+# convolution; strides is the sliding step in each dimension, usually strides[0]=strides[3]=1
+# padding can be "SAME" or "VALID"; "SAME" pads the borders with zeros
+def conv2d(x, W):
+    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")
+
+
+# pooling
+def max_pool_2x2(x):
+
+    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
+
+# define 2 parameters for running on modelArts
+# data_url and train_url are the fixed ModelArts training parameters for the dataset path and the model output path
+    parser.add_argument('--data_url',
+                        help='path to training/inference dataset folder',
+                        default=workroot + '/dataset/')
+
+    parser.add_argument('--train_url',
+                        help='model folder to save/load',
+                        default=workroot + '/output/')
+    parser.add_argument(
+        '--device_target',
+        type=str,
+        default="Ascend",
+        choices=['Ascend', 'CPU'],
+        help='device where the code will be implemented (default: CPU); to use an NPU on the Qizhi platform, add the run parameter device_target=Ascend on the training page')
+
+# modelArts already uses data_url and train_url by default
+    parser.add_argument('--epoch_size',
+                        type=int,
+                        default=5,
+                        help='Training epochs.')
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    print('args:')
+    print(args)
+
+    mnist = input_data.read_data_sets('mnist_data', one_hot=True)
+
+    # create the placeholder x to hold batches of MNIST images;
+    # in [None, 784], None means an unrestricted batch size and 784 is the size of one image (28*28=784)
+    x = tf.placeholder(tf.float32, [None, 784], name='input')
+    # y_ holds the ground-truth labels of the input images
+    y_ = tf.placeholder(tf.float32, [None, 10])
+
+    # reshape the 784-dim vector back into a 28x28 image;
+    # the last argument is the depth: MNIST images are grayscale, so the depth is 1;
+    # the first argument -1 leaves that dimension unrestricted, so the batch size stays flexible
+    x_image = tf.reshape(x, [-1, 28, 28, 1])
+
+    # first convolution layer
+    # the filter is a 5x5x1 matrix: 5x5 is the filter size and 1 is the depth,
+    # because MNIST images are grayscale with a single channel;
+    # 32 means 32 such 5x5x1 filters, giving 32 feature maps (one per filter), i.e. an output depth of 32
+    W_conv1 = weight_variable([5, 5, 1, 32])
+    # one bias per feature map
+    b_conv1 = bias_variable([32])
+    # convolve with conv2d, then apply ReLU as the activation
+    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
+    # pool after the convolution
+    h_pool1 = max_pool_2x2(h_conv1)
+
+    # second convolution layer
+    # the first layer outputs depth 32, so the filter depth and the next output depth change accordingly
+    W_conv2 = weight_variable([5, 5, 32, 64])
+    b_conv2 = bias_variable([64])
+    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
+    h_pool2 = max_pool_2x2(h_conv2)
+
+    # fully connected layer
+    # after two rounds of convolution and pooling the image is 7x7
+    # (the first pooling outputs (28/2)x(28/2), the second (14/2)x(14/2)) with depth 64;
+    # we add a fully connected layer with 1024 neurons, so the weight W has shape [7 * 7 * 64, 1024]
+    W_fc1 = weight_variable([7 * 7 * 64, 1024])
+    # as many biases as output neurons
+    b_fc1 = bias_variable([1024])
+    # flatten the pooled tensor (7x7x64) into a vector (the same input shape as the earlier Softmax model)
+    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
+    # ReLU activation
+    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
+
+    # dropout
+    # apply dropout before the output layer to reduce overfitting
+    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
+    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
+
+    # output layer
+    # the fully connected layer outputs 1024 values and we want 10 classes (0-9),
+    # so the weight W has shape [1024, 10]
+    W_fc2 = weight_variable([1024, 10])
+    b_fc2 = bias_variable([10])
+
+    # finally apply Softmax to turn the output into probabilities
+    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output')
+
+    # loss function and optimizer
+    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv)))
+    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
+
+    # test accuracy, same as for the Softmax regression model
+    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+
+    train_dir = '/cache/output/'  # model output path
+    if not os.path.exists(train_dir):
+        os.mkdir(train_dir)
+    obs_train_url = args.train_url
+    # start training
+    with tf.Session() as sess:
+        # initialize all variables
+        sess.run(tf.global_variables_initializer())
+        # train for 2000 steps
+        for i in range(2000):
+            # fetch 50 images and their labels per step
+            batch = mnist.train.next_batch(50)
+            # print the training accuracy every 100 steps
+            if i % 100 == 0:
+                train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
+                print("step %d, training accuracy %g" % (i, train_accuracy))
+            # the actual training step: feed the data in
+            sess.run(train_step, feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
+
+        # save in SavedModel format
+        tf.compat.v1.saved_model.simple_save(sess,
+                                             train_dir + "saved_model",
+                                             inputs={"input": x, 'keep_prob': keep_prob},
+                                             outputs={"output": y_conv})
+
diff --git a/grampus_train.py b/grampus_train.py
new file mode 100755
index 0000000..81116b7
--- /dev/null
+++ b/grampus_train.py
@@ -0,0 +1,74 @@
+'''
+For A100 compatibility, please debug your code in the debugging environment with one of the
+platform's recommended images with CUDA 11 before using the training environment, then adjust
+the code and submit the image.
+The image used in this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In this training environment, the uploaded dataset is automatically placed under /tmp/dataset and the
+model download path defaults to /tmp/output. Please write the model output to /tmp/output;
+the Qizhi platform provides file downloads from that directory.
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# the dataset is placed under /tmp/dataset
+parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size of each epoch')
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    # log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    # log output
+    print('epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        print('the {} epoch_size begin'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            label_np = np.zeros((train_label.shape[0], 10))
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            sgd.step()
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            test_x = test_x
+            test_label = test_label
+            predict_y = model(test_x.to(device).float()).detach()
+            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+            label_np = test_label.numpy()
+            _ = predict_ys == test_label
+            correct += np.sum(_.numpy(), axis=-1)
+            _sum += _.shape[0]
+        # log output
+        print('accuracy: {:.2f}'.format(correct / _sum))
+        # write the model output under /tmp/output
+        torch.save(model, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
\ No newline at end of file
diff --git a/inference.py b/inference.py
new file mode 100755
index 0000000..3fb5271
--- /dev/null
+++ b/inference.py
@@ -0,0 +1,156 @@
+"""
+######################## inference lenet example ########################
+inference lenet according to model file
+"""
+
+"""
+######################## instructions for the inference environment ########################
+1. In the inference environment the dataset has to be copied from OBS into the inference image,
+and after inference the output has to be copied back to OBS.
+(1) Copy the dataset from OBS into the inference image:
+    obs_data_url = args.data_url
+    args.data_url = '/home/work/user-job-dir/data/'
+    if not os.path.exists(args.data_url):
+        os.mkdir(args.data_url)
+    try:
+        mox.file.copy_parallel(obs_data_url, args.data_url)
+        print("Successfully Download {} to {}".format(obs_data_url,
+                                                      args.data_url))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(
+            obs_data_url, args.data_url) + str(e))
+
+(2) Copy the model file from OBS into the inference image:
+    obs_ckpt_url = args.ckpt_url
+    args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt'
+    try:
+        mox.file.copy(obs_ckpt_url, args.ckpt_url)
+        print("Successfully Download {} to {}".format(obs_ckpt_url,
+                                                      args.ckpt_url))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(
+            obs_ckpt_url, args.ckpt_url) + str(e))
+
+(3) Copy the output back to OBS:
+    obs_result_url = args.result_url
+    args.result_url = '/home/work/user-job-dir/result/'
+    if not os.path.exists(args.result_url):
+        os.mkdir(args.result_url)
+    try:
+        mox.file.copy_parallel(args.result_url, obs_result_url)
+        print("Successfully Upload {} to {}".format(args.result_url, obs_result_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e))
+See the sample code below for details:
+"""
+
+import os
+import argparse
+import moxing as mox
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore import Tensor
+import numpy as np
+from glob import glob
+from dataset import create_dataset
+from config import mnist_cfg as cfg
+from lenet import LeNet5
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+    parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
+                        help='device where the code will be implemented (default: Ascend)')
+    parser.add_argument('--data_url',
+                        type=str,
+                        default="./Data",
+                        help='path where the dataset is saved')
+    parser.add_argument('--ckpt_url',
+                        help='model to save/load',
+                        default='./ckpt_url')
+    parser.add_argument('--result_url',
+                        help='result folder to save/load',
+                        default='./result')
+
+    args = parser.parse_args()
+
+    # copy the dataset from OBS into the inference image:
+    obs_data_url = args.data_url
+    args.data_url = '/home/work/user-job-dir/data/'
+    if not os.path.exists(args.data_url):
+        os.mkdir(args.data_url)
+    try:
+        mox.file.copy_parallel(obs_data_url, args.data_url)
+        print("Successfully Download {} to {}".format(obs_data_url,
+                                                      args.data_url))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(
+            obs_data_url, args.data_url) + str(e))
+
+    # use mox.file.copy_parallel for directories and mox.file.copy for a single file; here a file is copied
+    # copy the model file from OBS into the inference image:
+    obs_ckpt_url = args.ckpt_url
+    args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt'
+    try:
+        mox.file.copy(obs_ckpt_url, args.ckpt_url)
+        print("Successfully Download {} to {}".format(obs_ckpt_url,
+                                                      args.ckpt_url))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(
+            obs_ckpt_url, args.ckpt_url) + str(e))
+
+    # set the output path result_url
+    obs_result_url = args.result_url
+    args.result_url = '/home/work/user-job-dir/result/'
+    if not os.path.exists(args.result_url):
+        os.mkdir(args.result_url)
+
+    args.dataset_path = args.data_url
+    args.save_checkpoint_path = args.ckpt_url
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    repeat_size = cfg.epoch_size
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
+
+    print("============== Starting Testing ==============")
+    args.load_ckpt_url = os.path.join(args.save_checkpoint_path)
+    print("args.load_ckpt_url is:{}".format(args.load_ckpt_url))
+    param_dict = load_checkpoint(args.load_ckpt_url)
+    load_param_into_net(network, param_dict)
+    # define the test dataset; with batch_size=1 a single image is taken
+    ds_test = create_dataset(os.path.join(args.dataset_path, "test"), batch_size=1).create_dict_iterator()
+    data = next(ds_test)
+
+    # images are the test images, labels are their ground-truth classes
+    images = data["image"].asnumpy()
+    labels = data["label"].asnumpy()
+    print('Tensor:', Tensor(data['image']))
+
+    # use model.predict to predict the class of the image
+    output = model.predict(Tensor(data['image']))
+    predicted = np.argmax(output.asnumpy(), axis=1)
+    pred = np.argmax(output.asnumpy(), axis=1)
+    print('predicted:', predicted)
+    print('pred:', pred)
+
+    # print the predicted class and the actual class, and write the prediction to result_url
+    print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
+    filename = 'result.txt'
+    file_path = os.path.join(args.result_url, filename)
+    with open(file_path, 'a+') as file:
+        file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))
+
+    # Upload results to obs
+    ######################## copy the output back to OBS (boilerplate) ########################
+    # copy the inference output from the local environment back to OBS;
+    # the corresponding inference task on the Qizhi platform provides the download
+    try:
+        mox.file.copy_parallel(args.result_url, obs_result_url)
+        print("Successfully Upload {} to {}".format(args.result_url, obs_result_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e))
+    ######################## copy the output model to OBS ########################
\ No newline at end of file
diff --git a/lenet.py b/lenet.py new file 
mode 100755 index 0000000..0600793 --- /dev/null +++ b/lenet.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LeNet.""" +import mindspore.nn as nn +from mindspore.common.initializer import Normal + + +class LeNet5(nn.Cell): + """ + Lenet network + + Args: + num_class (int): Number of classes. Default: 10. + num_channel (int): Number of channels. Default: 1. + + Returns: + Tensor, output tensor + Examples: + >>> LeNet(num_class=10) + + """ + def __init__(self, num_class=10, num_channel=1, include_top=True): + super(LeNet5, self).__init__() + self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.include_top = include_top + if self.include_top: + self.flatten = nn.Flatten() + self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) + self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) + self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) + + def construct(self, x): + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + if not self.include_top: + return x + x = self.flatten(x) + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x diff --git a/model.py b/model.py new file mode 100755 index 0000000..9a7f565 --- /dev/null +++ b/model.py @@ -0,0 +1,35 @@ +from torch.nn import Module +from torch import nn + + +class Model(Module): + def __init__(self): + super(Model, self).__init__() + self.conv1 = nn.Conv2d(1, 6, 5) + self.relu1 = nn.ReLU() + self.pool1 = nn.MaxPool2d(2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.relu2 = nn.ReLU() + self.pool2 = nn.MaxPool2d(2) + self.fc1 = nn.Linear(256, 120) + self.relu3 = nn.ReLU() + self.fc2 = nn.Linear(120, 84) + self.relu4 = nn.ReLU() + self.fc3 = nn.Linear(84, 10) + self.relu5 = nn.ReLU() + + def forward(self, x): + y = self.conv1(x) + y = self.relu1(y) + y = self.pool1(y) + y = self.conv2(y) + y = self.relu2(y) + y = self.pool2(y) + y = y.view(y.shape[0], -1) + y = self.fc1(y) + y = self.relu3(y) + y = self.fc2(y) + y = self.relu4(y) + y = self.fc3(y) + y = self.relu5(y) + return y diff --git a/npu/Example_Picture/数据集上传位置.png b/npu/Example_Picture/数据集上传位置.png new file mode 100755 index 0000000..e1a6fce Binary files /dev/null and b/npu/Example_Picture/数据集上传位置.png differ diff --git a/npu/Example_Picture/新建训练任务页面.png b/npu/Example_Picture/新建训练任务页面.png new file mode 100755 index 0000000..4ff5a1e Binary files /dev/null and b/npu/Example_Picture/新建训练任务页面.png differ diff --git a/npu/Example_Picture/查看日志页面.png b/npu/Example_Picture/查看日志页面.png new file mode 100755 index 0000000..7ea7f97 Binary files /dev/null and b/npu/Example_Picture/查看日志页面.png differ diff --git a/npu/Example_Picture/模型下载页面.png b/npu/Example_Picture/模型下载页面.png new file mode 100755 index 
0000000..61aafac Binary files /dev/null and b/npu/Example_Picture/模型下载页面.png differ diff --git a/npu/Example_Picture/运行参数界面.png b/npu/Example_Picture/运行参数界面.png new file mode 100755 index 0000000..16ef61c Binary files /dev/null and b/npu/Example_Picture/运行参数界面.png differ diff --git a/npu/README.md b/npu/README.md
new file mode 100755
index 0000000..9a1e647
--- /dev/null
+++ b/npu/README.md
@@ -0,0 +1,71 @@
+
+# How to train a model on the Qizhi platform - NPU version
+
+## 1 Overview
+- Using LeNet-MNIST as an example, this project briefly shows how to complete a training task with MindSpore on the Qizhi AI collaboration platform, providing a cloud-brain training example for AI beginners.
+- You can create your own training task directly with the dataset and code files provided by this project.
+- The Qizhi platform integrates ModelArts and OBS, bringing datasets, code and training resource pools together on the Qizhi AI collaboration platform for developers.
+  - ModelArts is Huawei Cloud's one-stop AI development platform for developers; it integrates Ascend AI processor resource pools, and users can try MindSpore on ModelArts.
+  - OBS is the storage service provided by Huawei Cloud.
+
+## 2 Preparation
+- To use the Qizhi platform you need to create a Qizhi account, clone the code into your own account and upload the dataset. For the detailed steps, see the beginner training camp courses in the [OpenI_Learning](https://git.openi.org.cn/zeizei/OpenI_Learning) project.
+
+### 2.1 Data preparation
+#### Dataset download
+- The dataset can be downloaded from this project's dataset page: [dataset download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/datasets?type=1)
+- Dataset description
+  - The MNISTData dataset consists of 10 classes of 28*28 grayscale images; the training set contains 60000 images and the test set contains 10000 images.
+  - The directory structure of the dataset files is as follows:
+> MNIST_Data
+> ├── test
+> │ ├── t10k-images-idx3-ubyte
+> │ └── t10k-labels-idx1-ubyte
+> └── train
+> ├── train-images-idx3-ubyte
+> └── train-labels-idx1-ubyte
+
+#### Dataset upload
+- Since this example is developed with MindSpore and has to run on NPU chips, the dataset must be uploaded on the NPU page.\
+[Note: if you just want to test-run this example, there is no need to upload the dataset again, because the MNIST_Example dataset used here is already public and can be referenced directly.]
+- As shown below:
+- ![avatar](Example_Picture/数据集上传位置.png)
+### 2.2 Script preparation
+#### Example code
+- The example code can be downloaded from this repository: [code download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example)
+- Code file description
+  - [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py), the training script, including copying the dataset from OBS into the training image, setting the number of epochs, and copying the trained model back to OBS. See the code comments in [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py) for details.
+
+  - [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py), the inference script.
+
+  - [config.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/config.py), the network configuration, used in train.py.
+
+  - [dataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/dataset.py), preprocesses the raw data into a dataset usable for network training.
+
+  - [lenet.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/lenet.py), the training network, used in train.py.
+
+## 3 Creating a training task
+- With the data and scripts ready, create a training task to actually run the MindSpore script. First-time users can refer to this example.
+
+### Create a training job with MindSpore as the training framework; the page looks like the screenshot below.
+![avatar](Example_Picture/新建训练任务页面.png)
+
+
+Table 1: parameters on the training job creation page
+
+| Parameter | Description |
+| ----------------- | ----------- |
+| Code branch | The branch of the repository code to use; master is the default choice. |
+| AI engine | Choose [Ascend-Powered-Engine] and the required MindSpore version (the screenshot shows [Mindspore-1.3.0-python3.7-aarch64]; please use the script matching the selected version). |
+| Start file | The launch script in the code directory. |
+| Dataset | A dataset already uploaded to the Qizhi platform. |
+| Run parameters | The data storage location and training output location correspond to the run parameters data_url and train_url; add run parameters to pass values for other script arguments such as epoch_size. Only those extra parameters need to be filled in here: data_url and train_url are added as run parameters by default, so you do not need to specify them again, only reference them in your code. |
+| Resource pool | Choose [Ascend: 1 * Ascend 910 CPU:24 cores 256GiB], i.e. a single machine with a single card. |
+
+Note: to use an NPU on the Qizhi platform, add the run parameter device_target=Ascend on the training page, otherwise the default is CPU, as shown below
+![avatar](Example_Picture/运行参数界面.png)
+## 4 Checking the results
+### 4.1 The run log can be viewed on the training job page
+![avatar](Example_Picture/查看日志页面.png)
+### 4.2 After training, the model file can be downloaded
+![avatar](Example_Picture/模型下载页面.png)
\ No newline at end of file
diff --git a/npu/config.py b/npu/config.py
new file mode 100755
index 0000000..22d68e2
--- /dev/null
+++ b/npu/config.py
@@ -0,0 +1,33 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +network config setting, will be used in train.py +""" + +from easydict import EasyDict as edict + +mnist_cfg = edict({ + 'num_classes': 10, + 'lr': 0.01, + 'momentum': 0.9, + 'epoch_size': 10, + 'batch_size': 32, + 'buffer_size': 1000, + 'image_height': 32, + 'image_width': 32, + 'save_checkpoint_steps': 1875, + 'keep_checkpoint_max': 150, + 'air_name': "lenet", +}) diff --git a/npu/dataset.py b/npu/dataset.py new file mode 100755 index 0000000..df9eecd --- /dev/null +++ b/npu/dataset.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Produce the dataset +""" + +import mindspore.dataset as ds +import mindspore.dataset.vision.c_transforms as CV +import mindspore.dataset.transforms.c_transforms as C +from mindspore.dataset.vision import Inter +from mindspore.common import dtype as mstype + + +def create_dataset(data_path, batch_size=32, repeat_size=1, + num_parallel_workers=1): + """ + create dataset for train or test + """ + # define dataset + mnist_ds = ds.MnistDataset(data_path) + + resize_height, resize_width = 32, 32 + rescale = 1.0 / 255.0 + shift = 0.0 + rescale_nml = 1 / 0.3081 + shift_nml = -1 * 0.1307 / 0.3081 + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/npu/dataset_distributed.py b/npu/dataset_distributed.py new file mode 100755 
index 0000000..d813078 --- /dev/null +++ b/npu/dataset_distributed.py @@ -0,0 +1,55 @@ + +""" +Produce the dataset: +与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取: +get_rank:获取当前设备在集群中的ID。 +get_group_size:获取集群数量。 + +""" + +import mindspore.dataset as ds +import mindspore.dataset.vision.c_transforms as CV +import mindspore.dataset.transforms.c_transforms as C +from mindspore.dataset.vision import Inter +from mindspore.common import dtype as mstype +from mindspore.communication.management import init, get_rank, get_group_size + + +def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, + num_parallel_workers=1, shard_id=0, num_shards=8): + """ + create dataset for train or test + """ + + resize_height, resize_width = 32, 32 + rescale = 1.0 / 255.0 + shift = 0.0 + rescale_nml = 1 / 0.3081 + shift_nml = -1 * 0.1307 / 0.3081 + # get shard_id and num_shards.Get the ID of the current device in the cluster And Get the number of clusters. + shard_id = get_rank() + num_shards = get_group_size() + # define dataset + mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/npu/inference.py b/npu/inference.py new file mode 100755 index 0000000..f0501e9 --- /dev/null +++ b/npu/inference.py @@ -0,0 +1,202 @@ +""" +######################## single-dataset inference lenet example ######################## +This example is a single-dataset inference tutorial. + +######################## Instructions for using the inference environment ######################## +The image of the debugging environment and the image of the inference environment are two different images, +and the working local directories are different. In the inference task, you need to pay attention to the following points. +1、(1)The structure of the dataset uploaded for single dataset inference in this example + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + + (2)The dataset structure of the single dataset in the inference image in this example + workroot + ├── data + | ├── test + | └── train + +2、Inference task requires predefined functions +(1)Defines whether the task is a inference environment or a debugging environment. 
+def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' #The inference task uses this parameter to represent the local path of the inference image + elif environment == 'debug': + workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +(2)Copy single dataset from obs to inference image. +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return + +(3)Copy ckpt file from obs to inference image. +def ObsUrlToEnv(obs_ckpt_url, ckpt_url): + try: + mox.file.copy(obs_ckpt_url, ckpt_url) + print("Successfully Download {} to {}".format(obs_ckpt_url, + ckpt_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + obs_ckpt_url, ckpt_url) + str(e)) + return + +(4)Copy the output result to obs. +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +3、4 parameters need to be defined. +--data_url is the dataset you selected on the Qizhi platform +--ckpt_url is the weight file you choose on the Qizhi platform + +--data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a single dataset, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code. + +4、How the dataset is used +Inference task uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method +of the dataset in the image. +For details, please refer to the following sample code. +""" + +import os +import argparse +import moxing as mox +import mindspore.nn as nn +from mindspore import context +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import Tensor +import numpy as np +from glob import glob +from dataset import create_dataset +from config import mnist_cfg as cfg +from lenet import LeNet5 + +### Defines whether the task is a inference environment or a debugging environment ### +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' + elif environment == 'debug': + workroot = '/home/work' + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +### Copy single dataset from obs to inference image ### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return +### Copy ckpt file from obs to inference image### +### To operate on folders, use mox.file.copy_parallel. If copying a file. 
+### Please use mox.file.copy to operate the file, this operation is to operate the file +def ObsUrlToEnv(obs_ckpt_url, ckpt_url): + try: + mox.file.copy(obs_ckpt_url, ckpt_url) + print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) + return +### Copy the output result to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +### --data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a inference task, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--data_url', + type=str, + default= WorkEnvironment('train') + '/data/', + help='path where the dataset is saved') +parser.add_argument('--ckpt_url', + help='model to save/load', + default= WorkEnvironment('train') + '/checkpoint.ckpt') +parser.add_argument('--result_url', + help='result folder to save/load', + default= WorkEnvironment('train') + '/result/') +parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], + help='device where the code will be implemented (default: Ascend)') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + ### defining the training environment + environment = 'train' + workroot = WorkEnvironment(environment) + + ###Initialize the data and result directories in the inference image### + data_dir = workroot + '/data' + result_dir = workroot + '/result' + ckpt_url = workroot + '/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(result_dir): + os.makedirs(result_dir) + + ###Copy dataset from obs to inference image + obs_data_url = args.data_url + ObsToEnv(obs_data_url, data_dir) + + ###Copy ckpt file from obs to inference image + obs_ckpt_url = args.ckpt_url + ObsUrlToEnv(obs_ckpt_url, ckpt_url) + + ###Set output path result_url + obs_result_url = args.result_url + + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + repeat_size = cfg.epoch_size + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + + print("============== Starting Testing ==============") + + param_dict = load_checkpoint(os.path.join(ckpt_url)) + load_param_into_net(network, param_dict) + ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator() + data = next(ds_test) + images = data["image"].asnumpy() + labels = data["label"].asnumpy() + print('Tensor:', Tensor(data['image'])) + output = model.predict(Tensor(data['image'])) + predicted = np.argmax(output.asnumpy(), axis=1) + pred = np.argmax(output.asnumpy(), axis=1) + print('predicted:', predicted) + print('pred:', pred) + + print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') + filename = 'result.txt' + file_path = 
os.path.join(result_dir, filename) + with open(file_path, 'a+') as file: + file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) + + ###Copy result data from the local running environment back to obs, + ###and download it in the inference task corresponding to the Qizhi platform + EnvToObs(result_dir, obs_result_url) \ No newline at end of file diff --git a/npu/lenet.py b/npu/lenet.py new file mode 100755 index 0000000..0600793 --- /dev/null +++ b/npu/lenet.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LeNet.""" +import mindspore.nn as nn +from mindspore.common.initializer import Normal + + +class LeNet5(nn.Cell): + """ + Lenet network + + Args: + num_class (int): Number of classes. Default: 10. + num_channel (int): Number of channels. Default: 1. + + Returns: + Tensor, output tensor + Examples: + >>> LeNet(num_class=10) + + """ + def __init__(self, num_class=10, num_channel=1, include_top=True): + super(LeNet5, self).__init__() + self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.include_top = include_top + if self.include_top: + self.flatten = nn.Flatten() + self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) + self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) + self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) + + def construct(self, x): + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + if not self.include_top: + return x + x = self.flatten(x) + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x diff --git a/npu/train.py b/npu/train.py new file mode 100755 index 0000000..42d09fe --- /dev/null +++ b/npu/train.py @@ -0,0 +1,193 @@ +""" +######################## single-dataset train lenet example ######################## +This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training +tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! + +######################## Instructions for using the training environment ######################## +The image of the debugging environment and the image of the training environment are two different images, +and the working local directories are different. In the training task, you need to pay attention to the following points. 
+1、(1)The structure of the dataset uploaded for single-dataset training in this example
+  MNISTData.zip
+  ├── test
+  │   ├── t10k-images-idx3-ubyte
+  │   └── t10k-labels-idx1-ubyte
+  └── train
+      ├── train-images-idx3-ubyte
+      └── train-labels-idx1-ubyte
+
+  (2)The structure of the single dataset inside the training image in this example
+  workroot
+  ├── data
+  |   ├── test
+  |   └── train
+
+2、Single-dataset training requires the following predefined functions
+(1)Determine whether the task runs in the training environment or the debugging environment.
+def WorkEnvironment(environment):
+    if environment == 'train':
+        workroot = '/home/work/user-job-dir' #The training task uses this parameter as the local path of the training image
+    elif environment == 'debug':
+        workroot = '/home/ma-user/work' #The debug task uses this parameter as the local path of the debug image
+    print('current work mode:' + environment + ', workroot:' + workroot)
+    return workroot
+
+(2)Copy the dataset from obs to the training image.
+def ObsToEnv(obs_data_url, data_dir):
+    try:
+        mox.file.copy_parallel(obs_data_url, data_dir)
+        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
+    return
+
+(3)Copy the output model to obs.
+def EnvToObs(train_dir, obs_train_url):
+    try:
+        mox.file.copy_parallel(train_dir, obs_train_url)
+        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
+    return
+
+3、Three parameters need to be defined
+--data_url is the dataset you selected on the Qizhi platform
+
+--data_url, --train_url and --device_target must be defined first in a single-dataset task,
+otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4、How the dataset is used
+The training task takes data_url as its input; inside the image the dataset is accessed
+through data_dir (i.e. workroot + '/data').
+For details, please refer to the following sample code.
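+
+A minimal sketch of the resulting call chain (the paths shown are the ones this tutorial
+assumes, as defined above, not universal defaults):
+    workroot = WorkEnvironment('train')          # /home/work/user-job-dir
+    data_dir = workroot + '/data'                # dataset root inside the training image
+    ObsToEnv(args.data_url, data_dir)            # pull the dataset from obs into the image
+    ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size)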
+ +""" + +import os +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed + +### Defines whether the task is a training environment or a debugging environment ### +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' + elif environment == 'debug': + workroot = '/home/work' + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return +### Copy the output model to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + ### defining the training environment + environment = 'train' + workroot = WorkEnvironment(environment) + + ###Initialize the data and model directories in the training image### + data_dir = workroot + '/data' + train_dir = workroot + '/model' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + + ### Copy the dataset from obs to the training image ### + ObsToEnv(args.data_url,data_dir) + + ###Specifies the device CPU or Ascend NPU used for training### + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + ds_train = create_dataset(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = 
TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) + diff --git a/npu/train_dataparallel.py b/npu/train_dataparallel.py new file mode 100755 index 0000000..19ba828 --- /dev/null +++ b/npu/train_dataparallel.py @@ -0,0 +1,205 @@ +""" +######################## single-dataset train lenet example ######################## +This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training +tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! + +######################## Instructions for using the training environment ######################## +The image of the debugging environment and the image of the training environment are two different images, +and the working local directories are different. In the training task, you need to pay attention to the following points. +1、(1)The structure of the dataset uploaded for single dataset training in this example + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + + (2)The dataset structure of the single dataset in the training image in this example + workroot + ├── data + | ├── test + | └── train + +2、Single dataset training requires predefined functions +(1)Defines whether the task is a training environment or a debugging environment. +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image + elif environment == 'debug': + workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +(2)Copy single dataset from obs to training image. +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return + +(3)Copy the output model to obs. 
+def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +3、3 parameters need to be defined +--data_url is the dataset you selected on the Qizhi platform + +--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code. + +4、How the dataset is used +A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method +of the dataset in the image. +For details, please refer to the following sample code. + +""" + +import os +import argparse +from dataset_distributed import create_dataset_parallel +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.common import set_seed +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank, get_group_size +import mindspore.ops as ops + + +# set device_id and init +device_id = int(os.getenv('DEVICE_ID')) +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") +context.set_context(device_id=device_id) +init() + +### Defines whether the task is a training environment or a debugging environment ### +def WorkEnvironment(environment): + if environment == 'train': + workroot = '/home/work/user-job-dir' + elif environment == 'debug': + workroot = '/home/work' + print('current work mode:' + environment + ', workroot:' + workroot) + return workroot + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return +### Copy the output model to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. 
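+### Sketch only, kept as comments so that behaviour is unchanged: after the init() call
+### above, the communication helpers imported from mindspore.communication.management
+### report each card's identity, which is how data-parallel scripts usually shard data.
+### num_shards/shard_id are standard arguments of MindSpore dataset loaders; whether
+### this script's create_dataset_parallel uses them internally is an assumption.
+# rank_id = get_rank()          # index of this card within the data-parallel group
+# rank_size = get_group_size()  # total number of cards in the group
+# e.g. ds.MnistDataset(path, num_shards=rank_size, shard_id=rank_id) reads one shard per card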
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') +set_seed(114514) +if __name__ == "__main__": + args = parser.parse_args() + ### defining the training environment + environment = 'train' + workroot = WorkEnvironment(environment) + + ###Initialize the data and model directories in the training image### + data_dir = workroot + '/data' + train_dir = workroot + '/model' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + + ### Copy the dataset from obs to the training image ### + ObsToEnv(args.data_url,data_dir) + + context.reset_auto_parallel_context() + context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) + ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In the example, get_rank() is added to distinguish different paths. 
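+    # With this scheme card 0 writes its checkpoints to <train_dir>/0/, card 1 to
+    # <train_dir>/1/, and so on, so the cards never overwrite each other's files.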
+ ckpoint_cb = ModelCheckpoint(prefix="data_parallel", + directory=train_dir + "/" + str(get_rank()) + "/", + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()], dataset_sink_mode=True) + + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) + diff --git a/npu/train_for_c2net.py b/npu/train_for_c2net.py new file mode 100755 index 0000000..c8cd10a --- /dev/null +++ b/npu/train_for_c2net.py @@ -0,0 +1,92 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) + +The training of the intelligent computing network currently supports single dataset training, and does not require +the obs copy process.It only needs to define two parameters and then call it directly: + train_dir = '/cache/output' #The location of the output + data_dir = '/cache/dataset' #The location of the dataset +""" +#!/usr/bin/python +#coding=utf-8 + +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +set_seed(1) + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + print('args:') + print(args) + + ###define two parameters and then call it directly### + train_dir = '/cache/output' + data_dir = '/cache/dataset' + + ###Specifies the device CPU or Ascend NPU used for training### + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + ds_train = create_dataset(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + 
print('epoch_size is: ', epoch_size)
+
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+
+    print("============== Finish Training ==============")
\ No newline at end of file
diff --git a/npu/train_for_multidataset.py b/npu/train_for_multidataset.py
new file mode 100755
index 0000000..0e08815
--- /dev/null
+++ b/npu/train_for_multidataset.py
@@ -0,0 +1,237 @@
+"""
+######################## multi-dataset train lenet example ########################
+This example is a multi-dataset training tutorial. For a single dataset, please refer to the single-dataset
+training tutorial train.py. This example cannot be used for a single dataset!
+"""
+"""
+######################## Instructions for using the training environment ########################
+1、(1)The structure of the datasets uploaded for multi-dataset training in this example
+  MNISTData.zip
+  ├── test
+  │   ├── t10k-images-idx3-ubyte
+  │   └── t10k-labels-idx1-ubyte
+  └── train
+      ├── train-images-idx3-ubyte
+      └── train-labels-idx1-ubyte
+
+  checkpoint_lenet-1_1875.zip
+  └── checkpoint_lenet-1_1875.ckpt
+
+  (2)The structure of the multiple datasets inside the training image in this example
+  workroot
+  ├── MNISTData
+  |   ├── test
+  |   └── train
+  └── checkpoint_lenet-1_1875
+      └── checkpoint_lenet-1_1875.ckpt
+
+2、Multi-dataset training requires the following predefined functions
+(1)Determine whether the task runs in the training environment or the debugging environment.
+def WorkEnvironment(environment):
+    if environment == 'train':
+        workroot = '/home/work/user-job-dir' #The training task uses this parameter as the local path of the training image
+    elif environment == 'debug':
+        workroot = '/home/ma-user/work' #The debug task uses this parameter as the local path of the debug image
+    print('current work mode:' + environment + ', workroot:' + workroot)
+    return workroot
+
+(2)Copy multiple datasets from obs to the training image
+def MultiObsToEnv(multi_data_url, workroot):
+    multi_data_json = json.loads(multi_data_url) #Parse multi_data_url
+    for i in range(len(multi_data_json)):
+        path = workroot + "/" + multi_data_json[i]["dataset_name"]
+        if not os.path.exists(path):
+            os.makedirs(path)
+        try:
+            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
+            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], path))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(multi_data_json[i]["dataset_url"], path) + str(e))
+    return
+
+***The input and output of the MultiObsToEnv function in this example:
+   Input for multi_data_url:
+   [
+    {
+      "dataset_url": "s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388eae3a316-42d6-4a43-a484-1fa573eab388/", #obs path of the dataset
+      "dataset_name": "MNIST_Data" #the name of the dataset
+    },
+    {
+      "dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c59be66-64ec-41ca-b311-f51a486eabf8/",
+      "dataset_name": "checkpoint_lenet-1_1875"
+    }
+   ]
+   Purpose of multi_data_url:
+   MultiObsToEnv copies each dataset from obs to the training image and builds its path inside the image.
+   For example, the path of the MNIST_Data dataset in this example is /home/work/user-job-dir/MNISTData,
+   and the path of the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875.
+
+(3)Copy the output model to obs.
+def EnvToObs(train_dir, obs_train_url):
+    try:
+        mox.file.copy_parallel(train_dir, obs_train_url)
+        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
+    return
+
+3、Four parameters need to be defined
+--data_url is the first dataset you selected on the Qizhi platform
+--multi_data_url is the multi-dataset you selected on the Qizhi platform
+
+--data_url, --multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task,
+otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4、How the dataset is used
+A multi-dataset task takes multi_data_url as its input; inside the training image a dataset is accessed
+through workroot + dataset name + file or folder name within the dataset.
+For example, the path of the train folder of the MNIST_Data dataset in this example is
+workroot + "/MNIST_Data" + "/train"
+
+For details, please refer to the following sample code.
+"""
+
+import os
+import argparse
+
+import moxing as mox
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from lenet import LeNet5
+import json
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore.common import set_seed
+from mindspore import load_checkpoint, load_param_into_net
+
+### Defines whether the task runs in a training environment or a debugging environment ###
+def WorkEnvironment(environment):
+    if environment == 'train':
+        workroot = '/home/work/user-job-dir'
+    elif environment == 'debug':
+        workroot = '/home/ma-user/work'
+    print('current work mode:' + environment + ', workroot:' + workroot)
+    return workroot
+
+### Copy multiple datasets from obs to the training image ###
+def MultiObsToEnv(multi_data_url, workroot):
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        path = workroot + "/" + multi_data_json[i]["dataset_name"]
+        if not os.path.exists(path):
+            os.makedirs(path)
+        try:
+            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
+            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], path))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(multi_data_json[i]["dataset_url"], path) + str(e))
+    return
+### Copy the output model to obs ###
+def EnvToObs(train_dir, obs_train_url):
+    try:
+        mox.file.copy_parallel(train_dir, obs_train_url)
+        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
+    return
+
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+### --data_url, --multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task,
+### otherwise an error will be reported.
+### There is no need to add these parameters to the running parameters of the Qizhi platform,
+### because they are predefined in the background; you only need to define them in your code.
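+### Illustrative sketch (the names and URLs are the sample values from the docstring
+### above, not live paths): --multi_data_url arrives as a JSON string such as
+###   [{"dataset_url": "s3://.../", "dataset_name": "MNIST_Data"},
+###    {"dataset_url": "s3://.../", "dataset_name": "checkpoint_lenet-1_1875"}]
+### so after MultiObsToEnv(args.multi_data_url, workroot) each dataset is available at
+### workroot + "/" + dataset_name inside the training image.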
+parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= WorkEnvironment('train') + '/data/') + +parser.add_argument('--multi_data_url', + help='path to multi dataset', + default= WorkEnvironment('train')) + +parser.add_argument('--train_url', + help='model folder to save/load', + default= WorkEnvironment('train') + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + # After defining the training environment, first execute the WorkEnv function and the GetMultiDataPath function to + # copy multiple datasets from obs to the training image + environment = 'train' + workroot = WorkEnvironment(environment) + MultiObsToEnv(args.multi_data_url, workroot) + + ### Define the output path in the training image + train_dir = workroot + '/model' + if not os.path.exists(train_dir): + os.makedirs(train_dir) + + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + #The dataset path is used here:workroot + "/MNIST_Data" +"/train" "" + ds_train = create_dataset(os.path.join(workroot + "/MNISTData", "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + ### Load the trained model:workroot + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" + load_param_into_net(network, load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875", + "checkpoint_lenet-1_1875.ckpt"))) + + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + ###Copy the trained model data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + EnvToObs(train_dir, args.train_url) + diff --git a/npu_test_model_4wxt_0.0.1.zip b/npu_test_model_4wxt_0.0.1.zip new file mode 100755 index 0000000..2c5df2f Binary files /dev/null and b/npu_test_model_4wxt_0.0.1.zip differ diff --git a/running.py b/running.py new file mode 100755 index 0000000..70f957c --- /dev/null +++ b/running.py @@ -0,0 +1,16 @@ +#!/usr/bin/python +#-*- coding: UTF-8 -*- +import time +import datetime + +timeStart = datetime.datetime.now() +print(timeStart.strftime('%Y-%m-%d %H:%M:%S')) +for letter in 'Python': + print('当前字母:%s' % letter) + 
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + time.sleep(30) + + +timeEnd = datetime.datetime.now() +print(timeEnd.strftime('%Y-%m-%d %H:%M:%S')) +print('经历多少秒:%s' % (timeEnd - timeStart).seconds) \ No newline at end of file diff --git a/test.py b/test.py new file mode 100755 index 0000000..2621bc3 --- /dev/null +++ b/test.py @@ -0,0 +1 @@ +print('for test only') \ No newline at end of file diff --git a/test_c2net_npu.py b/test_c2net_npu.py new file mode 100755 index 0000000..1197b1a --- /dev/null +++ b/test_c2net_npu.py @@ -0,0 +1,93 @@ +#!/usr/bin/python +#coding=utf-8 + +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" + +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +set_seed(1) + +if __name__ == "__main__": + args = parser.parse_args() + print('args:') + print(args) + + # train_dir = '/tmp/output' + # data_dir = '/tmp/dataset' + train_dir = '/cache/output' + data_dir = '/cache/dataset' + + #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + #创建数据集 + ds_train = create_dataset(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + #创建网络 + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #定义模型输出路径 + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + #开始训练 + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + + print("============== Finish Training ==============") \ No newline at end of file diff --git a/tf_train.py b/tf_train.py new file mode 100755 index 0000000..31d884d --- /dev/null +++ b/tf_train.py @@ -0,0 +1,158 @@ +# coding: utf-8 +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data +import os +import argparse +import moxing as mox + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +workroot = 
'/home/work/user-job-dir' +#初始化过滤器 +def weight_variable(shape): + return tf.Variable(tf.truncated_normal(shape, stddev=0.1)) + +#初始化偏置,初始化时,所有值是0.1 +def bias_variable(shape): + return tf.Variable(tf.constant(0.1, shape=shape)) + +#卷积运算,strides表示每一维度滑动的步长,一般strides[0]=strides[3]=1 +#第四个参数可选"Same"或"VALID",“Same”表示边距使用全0填充 +def conv2d(x, W): + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") + + +#池化运算 +def max_pool_2x2(x): + + return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") + +def parse_args(): + parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +# define 2 parameters for running on modelArts +# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 + parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= workroot + '/data/') + + parser.add_argument('--train_url', + help='model folder to save/load', + default= workroot + '/model/') + parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') + +#modelarts已经默认使用data_url和train_url + parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = parse_args() + print('args:') + print(args) + + mnist = input_data.read_data_sets('mnist_data', one_hot=True) + + #创建x占位符,用于临时存放MNIST图片的数据, + # [None, 784]中的None表示不限长度,而784则是一张图片的大小(28×28=784) + x = tf.placeholder(tf.float32, [None, 784], name='input') + #y_存的是实际图像的标签,即对应于每张输入图片实际的值 + y_ = tf.placeholder(tf.float32, [None, 10]) + + #将图片从784维向量重新还原为28×28的矩阵图片, + # 原因参考卷积神经网络模型图,最后一个参数代表深度, + # 因为MNIST是黑白图片,所以深度为1, + # 第一个参数为-1,表示一维的长度不限定,这样就可以灵活设置每个batch的训练的个数了 + x_image = tf.reshape(x, [-1, 28, 28, 1]) + + #第一层卷积 + #将过滤器设置成5×5×1的矩阵, + #其中5×5表示过滤器大小,1表示深度,因为MNIST是黑白图片只有一层。所以深度为1 + #32表示我们要创建32个大小5×5×1的过滤器,经过卷积后算出32个特征图(每个过滤器得到一个特征图),即输出深度为64 + W_conv1 = weight_variable([5, 5, 1, 32]) + #有多少个特征图就有多少个偏置 + b_conv1 = bias_variable([32]) + #使用conv2d函数进行卷积计算,然后再用ReLU作为激活函数 + h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) + #卷积以后再经过池化操作 + h_pool1 = max_pool_2x2(h_conv1) + + #第二层卷积 + #因为经过第一层卷积运算后,输出的深度为32,所以过滤器深度和下一层输出深度也做出改变 + W_conv2 = weight_variable([5, 5, 32, 64]) + b_conv2 = bias_variable([64]) + h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) + h_pool2 = max_pool_2x2(h_conv2) + + #全连接层 + #经过两层卷积后,图片的大小为7×7(第一层池化后输出为(28/2)×(28/2), + #第二层池化后输出为(14/2)×(14/2)),深度为64, + #我们在这里加入一个有1024个神经元的全连接层,所以权重W的尺寸为[7 * 7 * 64, 1024] + W_fc1 = weight_variable([7 * 7 * 64, 1024]) + #偏置的个数和权重的个数一致 + b_fc1 = bias_variable([1024]) + #这里将第二层池化后的张量(长:7 宽:7 深度:64) 变成向量(跟上一节的Softmax模型的输入一样了) + h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) + #使用ReLU激活函数 + h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) + + #dropout + #为了减少过拟合,我们在输出层之前加入dropout + keep_prob = tf.placeholder(tf.float32, name='keep_prob') + h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) + + #输出层 + #全连接层输入的大小为1024,而我们要得到的结果的大小是10(0~9), + # 所以这里权重W的尺寸为[1024, 10] + W_fc2 = weight_variable([1024, 10]) + b_fc2 = bias_variable([10]) + + #最后都要经过Softmax函数将输出转化为概率问题 + y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output') + + #损失函数和损失优化 + cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv))) + train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) + + #测试准确率,跟Softmax回归模型的一样 + correct_prediction = 
tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + + train_dir = workroot + '/model/' #模型存放路径 + if not os.path.exists(train_dir): + os.mkdir(train_dir) + obs_train_url = args.train_url + #开始训练 + with tf.Session() as sess: + #初始化所有变量 + sess.run(tf.global_variables_initializer()) + #训练两万次 + for i in range(2000): + #每次获取50张图片数据和对应的标签 + batch = mnist.train.next_batch(50) + #每训练100次,我们打印一次训练的准确率 + if i % 100 == 0: + train_accuracy =sess.run(accuracy, feed_dict={x:batch[0], y_:batch[1], keep_prob:1.0}) + print("step %d, training accuracy %g" % (i, train_accuracy)) + #这里是真的训练,将数据传入 + sess.run(train_step, feed_dict={x:batch[0], y_:batch[1], keep_prob:0.5}) + + # 用SavedModel的方式保存 + tf.compat.v1.saved_model.simple_save(sess, + train_dir +"saved_model", + inputs={"input": x, 'keep_prob':keep_prob}, + outputs={"output": y_conv}) + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) diff --git a/tf_train_new.py b/tf_train_new.py new file mode 100755 index 0000000..119cec1 --- /dev/null +++ b/tf_train_new.py @@ -0,0 +1,146 @@ +# coding: utf-8 +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data +import os + +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' + +mnist = input_data.read_data_sets('mnist_data', one_hot=True) + +#初始化过滤器 +def weight_variable(shape): + return tf.Variable(tf.truncated_normal(shape, stddev=0.1)) + +#初始化偏置,初始化时,所有值是0.1 +def bias_variable(shape): + return tf.Variable(tf.constant(0.1, shape=shape)) + +#卷积运算,strides表示每一维度滑动的步长,一般strides[0]=strides[3]=1 +#第四个参数可选"Same"或"VALID",“Same”表示边距使用全0填充 +def conv2d(x, W): + return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") + + +#池化运算 +def max_pool_2x2(x): + + return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") + +#创建x占位符,用于临时存放MNIST图片的数据, +# [None, 784]中的None表示不限长度,而784则是一张图片的大小(28×28=784) +x = tf.placeholder(tf.float32, [None, 784], name='input') +#y_存的是实际图像的标签,即对应于每张输入图片实际的值 +y_ = tf.placeholder(tf.float32, [None, 10]) + +#将图片从784维向量重新还原为28×28的矩阵图片, +# 原因参考卷积神经网络模型图,最后一个参数代表深度, +# 因为MNIST是黑白图片,所以深度为1, +# 第一个参数为-1,表示一维的长度不限定,这样就可以灵活设置每个batch的训练的个数了 +x_image = tf.reshape(x, [-1, 28, 28, 1]) + +#第一层卷积 +#将过滤器设置成5×5×1的矩阵, +#其中5×5表示过滤器大小,1表示深度,因为MNIST是黑白图片只有一层。所以深度为1 +#32表示我们要创建32个大小5×5×1的过滤器,经过卷积后算出32个特征图(每个过滤器得到一个特征图),即输出深度为64 +W_conv1 = weight_variable([5, 5, 1, 32]) +#有多少个特征图就有多少个偏置 +b_conv1 = bias_variable([32]) +#使用conv2d函数进行卷积计算,然后再用ReLU作为激活函数 +h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) +#卷积以后再经过池化操作 +h_pool1 = max_pool_2x2(h_conv1) + +#第二层卷积 +#因为经过第一层卷积运算后,输出的深度为32,所以过滤器深度和下一层输出深度也做出改变 +W_conv2 = weight_variable([5, 5, 32, 64]) +b_conv2 = bias_variable([64]) +h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) +h_pool2 = max_pool_2x2(h_conv2) + +#全连接层 +#经过两层卷积后,图片的大小为7×7(第一层池化后输出为(28/2)×(28/2), +#第二层池化后输出为(14/2)×(14/2)),深度为64, +#我们在这里加入一个有1024个神经元的全连接层,所以权重W的尺寸为[7 * 7 * 64, 1024] +W_fc1 = weight_variable([7 * 7 * 64, 1024]) +#偏置的个数和权重的个数一致 +b_fc1 = bias_variable([1024]) +#这里将第二层池化后的张量(长:7 宽:7 深度:64) 变成向量(跟上一节的Softmax模型的输入一样了) +h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) +#使用ReLU激活函数 +h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) + +#dropout +#为了减少过拟合,我们在输出层之前加入dropout +keep_prob = tf.placeholder(tf.float32, name='keep_prob') +h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) + +#输出层 
+#全连接层输入的大小为1024,而我们要得到的结果的大小是10(0~9), +# 所以这里权重W的尺寸为[1024, 10] +W_fc2 = weight_variable([1024, 10]) +b_fc2 = bias_variable([10]) + +#最后都要经过Softmax函数将输出转化为概率问题 +y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output') + +#损失函数和损失优化 +cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv))) +train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) + +#测试准确率,跟Softmax回归模型的一样 +correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) +accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + +# #将训练结果保存,如果不保存我们这次训练结束后的结果也随着程序运行结束而释放了 +# savePath = './mnist_conv/' +# saveFile = savePath + 'mnist_conv.ckpt' +# if os.path.exists(savePath) == False: +# os.mkdir(savePath) + +# saver = tf.train.Saver() + + +#开始训练 +with tf.Session() as sess: + + #初始化所有变量 + sess.run(tf.global_variables_initializer()) + + #训练两万次 + for i in range(2000): + + #每次获取50张图片数据和对应的标签 + batch = mnist.train.next_batch(50) + + #每训练100次,我们打印一次训练的准确率 + if i % 100 == 0: + train_accuracy =sess.run(accuracy, feed_dict={x:batch[0], y_:batch[1], keep_prob:1.0}) + print("step %d, training accuracy %g" % (i, train_accuracy)) + + #这里是真的训练,将数据传入 + sess.run(train_step, feed_dict={x:batch[0], y_:batch[1], keep_prob:0.5}) + + + + # print ("end train, start testing...") + # mean_value = 0.0 + # for i in range(mnist.test.labels.shape[0]): + # batch = mnist.test.next_batch(50) + # train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0}) + # mean_value += train_accuracy + + + + # print("test accuracy %g" % (mean_value / mnist.test.labels.shape[0])) + # #训练结束后,我们使用mnist.test在测试最后的准确率 + # print("test accuracy %g" % sess.run(accuracy, feed_dict={x:mnist.test.images, y_:mnist.test.labels, keep_prob:1.0})) + + + # 最后,将会话保存下来 + # saver.save(sess, saveFile) + + # 用SavedModel的方式保存 + tf.compat.v1.saved_model.simple_save(sess, + "./saved_model", + inputs={"input": x, 'keep_prob':keep_prob}, + outputs={"output": y_conv}) diff --git a/train.py b/train.py new file mode 100755 index 0000000..4b8ba9b --- /dev/null +++ b/train.py @@ -0,0 +1,185 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +""" +######################## 训练环境使用说明 ######################## +假设已经使用Ascend NPU调试环境调试完代码,欲将调试环境的代码迁移到训练环境进行训练,需要做以下工作: +1、调试环境的镜像和训练环境的镜像是两个不同的镜像,所处的运行目录不一致,需要将data_url和train_url的路径进行变换 +在调试环境中: +args.data_url = '/home/ma-user/work/data/' //数据集位置 +args.train_url = '/home/ma-user/work/model/' //训练输出的模型位置 +在训练环境变换为: +args.data_url = '/home/work/user-job-dir/data/' +args.train_url = '/home/work/user-job-dir/model/' +2、在训练环境中,需要将数据集从obs拷贝到训练镜像中,训练完以后,需要将输出的模型拷贝到obs. 
+将数据集从obs拷贝到训练镜像中: + + obs_data_url = args.data_url + args.data_url = '/home/work/user-job-dir/data/' + if not os.path.exists(args.data_url): + os.mkdir(args.data_url) + try: + mox.file.copy_parallel(obs_data_url, args.data_url) + print("Successfully Download {} to {}".format(obs_data_url, + args.data_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + obs_data_url, args.data_url) + str(e)) + +将输出的模型拷贝到obs: + obs_train_url = args.train_url + args.train_url = '/home/work/user-job-dir/model/' + if not os.path.exists(args.train_url): + os.mkdir(args.train_url) +try: + mox.file.copy_parallel(args.train_url, obs_train_url) + print("Successfully Upload {} to {}".format(args.train_url, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(args.train_url, + obs_train_url) + str(e)) + +""" + +import os +import numpy as np +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed +from mindspore import Tensor, export + +#配置默认的工作空间根目录 +# environment = 'debug' +environment = 'train' +if environment == 'debug': + workroot = '/home/ma-user/work' #调试任务使用该参数 +else: + workroot = '/home/work/user-job-dir' # 训练任务使用该参数 +print('current work mode:' + environment + ', workroot:' + workroot) + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +# define 2 parameters for running on modelArts +# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= workroot + '/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= workroot + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') + +#modelarts已经默认使用data_url和train_url +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +set_seed(1) + +if __name__ == "__main__": + args = parser.parse_args() + print('args:') + print(args) + + data_dir = workroot + '/data' #数据集存放路径 + train_dir = workroot + '/model' #模型存放路径 + #初始化数据存放目录 + if not os.path.exists(data_dir): + os.mkdir(data_dir) + #初始化模型存放目录 + obs_train_url = args.train_url + train_dir = workroot + '/model/' + if not os.path.exists(train_dir): + os.mkdir(train_dir) + ######################## 将数据集从obs拷贝到训练镜像中 (固定写法)######################## + # 在训练环境中定义data_url和train_url,并把数据从obs拷贝到相应的固定路径,以下写法是将数据拷贝到/home/work/user-job-dir/data/目录下,可修改为其他目录 + #创建数据存放的位置 + if environment == 'train': + obs_data_url = args.data_url + #将数据拷贝到训练环境 + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, + data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + obs_data_url, data_dir) + str(e)) +######################## 将数据集从obs拷贝到训练镜像中 ######################## + + #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + #创建数据集 + ds_train = create_dataset(os.path.join(data_dir, "train"), + 
cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + #创建网络 + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #定义模型输出路径 + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + #开始训练 + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + input = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32) + export(network, Tensor(input), file_name=(train_dir +'LeNet5_model'), file_format='MINDIR') + + export(network, Tensor(input), file_name=(train_dir +'LeNet5_onnx_model'), file_format='ONNX') + ######################## 将输出的模型拷贝到obs(固定写法) ######################## + # 把训练后的模型数据从本地的运行环境拷贝回obs,在启智平台相对应的训练任务中会提供下载 + if environment == 'train': + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + ######################## 将输出的模型拷贝到obs ######################## diff --git a/train_longparam.py b/train_longparam.py new file mode 100755 index 0000000..7546110 --- /dev/null +++ b/train_longparam.py @@ -0,0 +1,207 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +""" +######################## 训练环境使用说明 ######################## +假设已经使用Ascend NPU调试环境调试完代码,欲将调试环境的代码迁移到训练环境进行训练,需要做以下工作: +1、调试环境的镜像和训练环境的镜像是两个不同的镜像,所处的运行目录不一致,需要将data_url和train_url的路径进行变换 +在调试环境中: +args.data_url = '/home/ma-user/work/data/' //数据集位置 +args.train_url = '/home/ma-user/work/model/' //训练输出的模型位置 +在训练环境变换为: +args.data_url = '/home/work/user-job-dir/data/' +args.train_url = '/home/work/user-job-dir/model/' +2、在训练环境中,需要将数据集从obs拷贝到训练镜像中,训练完以后,需要将输出的模型拷贝到obs. 
+将数据集从obs拷贝到训练镜像中: + + obs_data_url = args.data_url + args.data_url = '/home/work/user-job-dir/data/' + if not os.path.exists(args.data_url): + os.mkdir(args.data_url) + try: + mox.file.copy_parallel(obs_data_url, args.data_url) + print("Successfully Download {} to {}".format(obs_data_url, + args.data_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + obs_data_url, args.data_url) + str(e)) + +将输出的模型拷贝到obs: + obs_train_url = args.train_url + args.train_url = '/home/work/user-job-dir/model/' + if not os.path.exists(args.train_url): + os.mkdir(args.train_url) +try: + mox.file.copy_parallel(args.train_url, obs_train_url) + print("Successfully Upload {} to {}".format(args.train_url, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(args.train_url, + obs_train_url) + str(e)) + +""" + +import os +import numpy as np +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.common import set_seed +from mindspore import Tensor, export + +#配置默认的工作空间根目录 +# environment = 'debug' +environment = 'train' +if environment == 'debug': + workroot = '/home/ma-user/work' #调试任务使用该参数 +else: + workroot = '/home/work/user-job-dir' # 训练任务使用该参数 +print('current work mode:' + environment + ', workroot:' + workroot) + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +# define 2 parameters for running on modelArts +# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= workroot + '/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= workroot + '/model/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') + +#modelarts已经默认使用data_url和train_url +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +parser.add_argument('--openI', + help='model folder to save/load', + default= True) +parser.add_argument('--sink_mode', + help='model folder to save/load', + default= True) +parser.add_argument('--dataset', + help='model folder to save/load', + default= 'hmdb51') +parser.add_argument('--checkpoint_path', + help='model folder to save/load', + default= './src/pretrained/rgb_imagenet.ckpt') +parser.add_argument('--mode', + help='model folder to save/load', + default= 'rgb') +parser.add_argument('--num_epochs', + help='model folder to save/load', + default= 40) +parser.add_argument('--distributed', + help='model folder to save/load', + default= True) + +set_seed(1) + +if __name__ == "__main__": + args = parser.parse_args() + print('args:') + print(args) + + data_dir = workroot + '/data' #数据集存放路径 + train_dir = workroot + '/model' #模型存放路径 + #初始化数据存放目录 + if not os.path.exists(data_dir): + os.mkdir(data_dir) + #初始化模型存放目录 + obs_train_url = args.train_url + train_dir = workroot + '/model/' + if not os.path.exists(train_dir): + os.mkdir(train_dir) + ######################## 将数据集从obs拷贝到训练镜像中 (固定写法)######################## + # 
在训练环境中定义data_url和train_url,并把数据从obs拷贝到相应的固定路径,以下写法是将数据拷贝到/home/work/user-job-dir/data/目录下,可修改为其他目录 + #创建数据存放的位置 + if environment == 'train': + obs_data_url = args.data_url + #将数据拷贝到训练环境 + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, + data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + obs_data_url, data_dir) + str(e)) +######################## 将数据集从obs拷贝到训练镜像中 ######################## + + #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU + context.set_context(mode=context.GRAPH_MODE, + device_target=args.device_target) + #创建数据集 + ds_train = create_dataset(os.path.join(data_dir, "train"), + cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + #创建网络 + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #定义模型输出路径 + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=train_dir, + config=config_ck) + #开始训练 + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + input = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32) + export(network, Tensor(input), file_name=(train_dir +'LeNet5_model'), file_format='MINDIR') + + export(network, Tensor(input), file_name=(train_dir +'LeNet5_onnx_model'), file_format='ONNX') + ######################## 将输出的模型拷贝到obs(固定写法) ######################## + # 把训练后的模型数据从本地的运行环境拷贝回obs,在启智平台相对应的训练任务中会提供下载 + if environment == 'train': + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + ######################## 将输出的模型拷贝到obs ########################
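+    ######################## Optional sanity check (sketch, kept as comments) ########################
+    # Assumption: the onnxruntime package is available in the image (it is not a dependency
+    # of this repository). MindSpore's export appends the .onnx suffix to file_name, so the
+    # exported model could be smoke-tested roughly like this:
+    # import onnxruntime as ort
+    # sess = ort.InferenceSession(train_dir + 'LeNet5_onnx_model.onnx')
+    # print(sess.run(None, {sess.get_inputs()[0].name: input}))  # `input` is the 1x1x32x32 array above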