
add scripts

test_95%
wjtest001 · 2 years ago · commit 651ff83151
44 changed files with 3144 additions and 0 deletions
  1. BIN Example_Picture/数据集上传位置.png
  2. BIN Example_Picture/新建训练任务页面.png
  3. BIN Example_Picture/查看日志页面.png
  4. BIN Example_Picture/模型下载页面.png
  5. BIN Example_Picture/运行参数界面.png
  6. +33 -0 config.py
  7. +59 -0 convert_pytorch.py
  8. +22 -0 convert_to_onnx.py
  9. +60 -0 dataset.py
  10. +73 -0 gpu/inference.py
  11. +35 -0 gpu/model.py
  12. +86 -0 gpu/train.py
  13. +71 -0 gpu/train_for_c2net.py
  14. +108 -0 gpu/train_for_multidataset.py
  15. +74 -0 gpu_train.py
  16. +30 -0 gpu_train_resnet50.py
  17. +154 -0 grampus_tf_train.py
  18. +74 -0 grampus_train.py
  19. +156 -0 inference.py
  20. +60 -0 lenet.py
  21. +35 -0 model.py
  22. BIN npu/Example_Picture/数据集上传位置.png
  23. BIN npu/Example_Picture/新建训练任务页面.png
  24. BIN npu/Example_Picture/查看日志页面.png
  25. BIN npu/Example_Picture/模型下载页面.png
  26. BIN npu/Example_Picture/运行参数界面.png
  27. +71 -0 npu/README.md
  28. +33 -0 npu/config.py
  29. +60 -0 npu/dataset.py
  30. +55 -0 npu/dataset_distributed.py
  31. +202 -0 npu/inference.py
  32. +60 -0 npu/lenet.py
  33. +193 -0 npu/train.py
  34. +205 -0 npu/train_dataparallel.py
  35. +92 -0 npu/train_for_c2net.py
  36. +237 -0 npu/train_for_multidataset.py
  37. BIN npu_test_model_4wxt_0.0.1.zip
  38. +16 -0 running.py
  39. +1 -0 test.py
  40. +93 -0 test_c2net_npu.py
  41. +158 -0 tf_train.py
  42. +146 -0 tf_train_new.py
  43. +185 -0 train.py
  44. +207 -0 train_longparam.py

BIN Example_Picture/数据集上传位置.png
Width: 2304  |  Height: 1098  |  Size: 189 kB

BIN Example_Picture/新建训练任务页面.png
Width: 1724  |  Height: 1444  |  Size: 73 kB

BIN Example_Picture/查看日志页面.png
Width: 1809  |  Height: 788  |  Size: 67 kB

BIN Example_Picture/模型下载页面.png
Width: 1819  |  Height: 776  |  Size: 73 kB

BIN Example_Picture/运行参数界面.png
Width: 2146  |  Height: 1294  |  Size: 108 kB

+33 -0 config.py

@@ -0,0 +1,33 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py
"""

from easydict import EasyDict as edict

mnist_cfg = edict({
'num_classes': 10,
'lr': 0.01,
'momentum': 0.9,
'epoch_size': 10,
'batch_size': 32,
'buffer_size': 1000,
'image_height': 32,
'image_width': 32,
'save_checkpoint_steps': 1875,
'keep_checkpoint_max': 10,
'air_name': "lenet",
})
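The rest of the commit consumes this config attribute-style (inference.py, for example, imports it as cfg). A minimal sketch of how the EasyDict is read; it assumes nothing beyond what config.py defines:

from config import mnist_cfg as cfg

# attribute access and key access are interchangeable on an EasyDict
assert cfg.lr == cfg['lr'] == 0.01
print('train {} epochs at lr={} with batch_size={}'.format(cfg.epoch_size, cfg.lr, cfg.batch_size))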

+59 -0 convert_pytorch.py

@@ -0,0 +1,59 @@
import torchvision
import torch
import argparse
from torch.autograd import Variable
import onnx
print(torch.__version__)

parser = argparse.ArgumentParser(description='PyTorch to ONNX conversion example')

parser.add_argument('--model',
                    type=str,
                    help='name of the model file under /dataset'
                    )
parser.add_argument('--n',
type=int,
default=256,
help='batch size for input shape type'
)
parser.add_argument('--c',
type=int,
default=1,
help='channel for input shape type'
)
parser.add_argument('--h',
type=int,
default=28,
help='height for input shape type'
)
parser.add_argument('--w',
type=int,
default=28,
help='width for input shape type'
)
if __name__ == "__main__":
args = parser.parse_args()
print('args:')
print(args)

model_file = '/dataset/' + args.model
print(model_file)
model = torch.load(model_file)
print(model)
print(type(model))
for k, v in model.named_parameters():
print("k:",k)
print("v:",v.shape)

    # rfind returns -1 when no dot is found (rindex would raise ValueError instead)
    suffix = args.model.rfind(".")
    out_file = '/model/' + args.model + ".onnx"
    if suffix != -1:
        out_file = '/model/' + args.model[0:suffix] + ".onnx"
print(out_file)
input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(args.n, args.c, args.h, args.w))
torch.onnx.export(model, input, out_file, input_names=input_name, output_names=output_name, verbose=True)
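A natural follow-up, since onnx is already imported at the top of this script, is to sanity-check the exported graph. This is an illustrative sketch, not part of the commit; the path is hypothetical and stands for whatever out_file resolved to:

import onnx

out_file = '/model/mnist.onnx'  # hypothetical: the path computed by the script above
onnx_model = onnx.load(out_file)
onnx.checker.check_model(onnx_model)  # raises if the graph is structurally invalid
print('exported model checks out:', out_file)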



+22 -0 convert_to_onnx.py

@@ -0,0 +1,22 @@
import numpy as np
from mindspore import Tensor, export, load_checkpoint
from mindvision.classification.models import resnet50
from mindvision.dataset import DownLoad

# download the pretrained ResNet50 model
dl = DownLoad()
dl.download_url('https://download.mindspore.cn/vision/classification/resnet50_224.ckpt')

resnet = resnet50(1000)
load_checkpoint("resnet50_224.ckpt", net=resnet)

input_np = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32)

# export resnet50_224.mindir to the current directory
export(resnet, Tensor(input_np), file_name='resnet50_224', file_format='MINDIR')



# save resnet50_224.onnx to the current directory
export(resnet, Tensor(input_np), file_name='resnet50_224', file_format='ONNX')
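To confirm the ONNX file is usable outside MindSpore, it can be run with onnxruntime. A minimal sketch under the assumption that onnxruntime is installed; the input name is read from the session rather than guessed:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('resnet50_224.onnx')
inp = sess.get_inputs()[0]
x = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32)
(logits,) = sess.run(None, {inp.name: x})
print(logits.shape)  # expected: (1, 1000) class logits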


+60 -0 dataset.py

@@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Produce the dataset
"""

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype


def create_dataset(data_path, batch_size=32, repeat_size=1,
num_parallel_workers=1):
"""
create dataset for train or test
"""
# define dataset
mnist_ds = ds.MnistDataset(data_path)

resize_height, resize_width = 32, 32
rescale = 1.0 / 255.0
shift = 0.0
rescale_nml = 1 / 0.3081
shift_nml = -1 * 0.1307 / 0.3081

# define map operations
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
rescale_op = CV.Rescale(rescale, shift)
hwc2chw_op = CV.HWC2CHW()
type_cast_op = C.TypeCast(mstype.int32)

# apply map operations on images
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)

# apply DatasetOps
buffer_size = 10000
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
mnist_ds = mnist_ds.repeat(repeat_size)

return mnist_ds
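A minimal usage sketch for create_dataset (illustrative, not part of the commit); it assumes an extracted MNIST_Data directory laid out as described in npu/README.md:

from dataset import create_dataset

ds_train = create_dataset('MNIST_Data/train', batch_size=32, repeat_size=1)
for batch in ds_train.create_dict_iterator():
    # after the map pipeline each batch holds a CHW float 'image' and an int32 'label'
    print(batch['image'].shape, batch['label'].shape)  # (32, 1, 32, 32) (32,)
    break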

+73 -0 gpu/inference.py

@@ -0,0 +1,73 @@
#!/usr/bin/python
#coding=utf-8
'''
GPU INFERENCE INSTANCE

If there are Chinese comments in the code, add the following at the top of the file:
#!/usr/bin/python
#coding=utf-8

For A100 compatibility, please use a platform-recommended image with CUDA 11,
adjust the code there, and then submit the image.
The image used in this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191

In the inference environment:
- The uploaded dataset is automatically placed under /dataset
  (if MnistDataset_torch.zip is selected, the dataset directory is /dataset/test).
- The selected model file is placed under /model.
- The output path is /result; the Qizhi platform provides downloads of files under /result.

'''


import numpy as np
import torch
from torchvision.datasets import mnist
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import os
import argparse


# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#get the model file name
parser.add_argument('--modelname', help='model name')



if __name__ == '__main__':
args, unknown = parser.parse_known_args()
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

test_dataset = mnist.MNIST(root='/dataset/test', train=False, transform=ToTensor(),
download=False)
test_loader = DataLoader(test_dataset, batch_size=256)
    #if the file name is fixed, model_path can be hard-coded
model_path = '/model/'+args.modelname

model = torch.load(model_path).to(device)
model.eval()

correct = 0
_sum = 0

for idx, (test_x, test_label) in enumerate(test_loader):
test_x = test_x
test_label = test_label
predict_y = model(test_x.to(device).float()).detach()
predict_ys = np.argmax(predict_y.cpu(), axis=-1)
label_np = test_label.numpy()
_ = predict_ys == test_label
correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum))
    #write the result to /result
filename = 'result.txt'
file_path = os.path.join('/result', filename)
with open(file_path, 'w') as file:
file.write('accuracy: {:.2f}'.format(correct / _sum))

+35 -0 gpu/model.py

@@ -0,0 +1,35 @@
from torch.nn import Module
from torch import nn


class Model(Module):
def __init__(self):
super(Model, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.relu1 = nn.ReLU()
self.pool1 = nn.MaxPool2d(2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.relu2 = nn.ReLU()
self.pool2 = nn.MaxPool2d(2)
self.fc1 = nn.Linear(256, 120)
self.relu3 = nn.ReLU()
self.fc2 = nn.Linear(120, 84)
self.relu4 = nn.ReLU()
self.fc3 = nn.Linear(84, 10)
self.relu5 = nn.ReLU()

def forward(self, x):
y = self.conv1(x)
y = self.relu1(y)
y = self.pool1(y)
y = self.conv2(y)
y = self.relu2(y)
y = self.pool2(y)
y = y.view(y.shape[0], -1)
y = self.fc1(y)
y = self.relu3(y)
y = self.fc2(y)
y = self.relu4(y)
y = self.fc3(y)
y = self.relu5(y)
return y
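The in_features=256 of fc1 follows from the 28x28 MNIST input: two 5x5 convolutions without padding and two 2x2 poolings leave 16 feature maps of size 4x4, and 16 * 4 * 4 = 256. A quick shape check (a sketch, not part of the commit):

import torch
from model import Model  # this file

net = Model()
out = net(torch.randn(1, 1, 28, 28))
print(out.shape)  # torch.Size([1, 10])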

+86 -0 gpu/train.py

@@ -0,0 +1,86 @@
#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code, add the following at the top of the file:
#!/usr/bin/python
#coding=utf-8

For A100 compatibility, before using the training environment please use a platform-recommended
image with CUDA 11, adjust the code there, and then submit the image.
The image used in this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded dataset is automatically placed under /dataset.
Single dataset:
    if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test.
Multiple datasets:
    if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
    the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
    and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.

The model download path defaults to /model. Please write model output to /model;
the Qizhi platform provides downloads of files under /model.
'''


from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used for training')

if __name__ == '__main__':
args, unknown = parser.parse_known_args()
#log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
sgd = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
epoch = args.epoch_size
print('epoch_size is:{}'.format(epoch))
for _epoch in range(epoch):
print('the {} epoch_size begin'.format(_epoch + 1))
model.train()
for idx, (train_x, train_label) in enumerate(train_loader):
train_x = train_x.to(device)
train_label = train_label.to(device)
label_np = np.zeros((train_label.shape[0], 10))
sgd.zero_grad()
predict_y = model(train_x.float())
loss = cost(predict_y, train_label.long())
if idx % 10 == 0:
print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
loss.backward()
sgd.step()

correct = 0
_sum = 0
model.eval()
for idx, (test_x, test_label) in enumerate(test_loader):
test_x = test_x
test_label = test_label
predict_y = model(test_x.to(device).float()).detach()
predict_ys = np.argmax(predict_y.cpu(), axis=-1)
label_np = test_label.numpy()
_ = predict_ys == test_label
correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum))
#The model output location is placed under /model
torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))

+71 -0 gpu/train_for_c2net.py

@@ -0,0 +1,71 @@
'''
In the training environment, the code is automatically placed under /tmp/code and the uploaded
dataset is automatically placed under /tmp/dataset. The model download path defaults to
/tmp/output; please write model output to /tmp/output, and the Qizhi platform UI will provide
downloads of the files under /tmp/output.
'''


from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#The dataset is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used for training')

if __name__ == '__main__':
args = parser.parse_args()
    #log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
sgd = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
epoch = args.epoch_size
    #log output
print('epoch_size is:{}'.format(epoch))
for _epoch in range(epoch):
print('the {} epoch_size begin'.format(_epoch + 1))
model.train()
for idx, (train_x, train_label) in enumerate(train_loader):
train_x = train_x.to(device)
train_label = train_label.to(device)
label_np = np.zeros((train_label.shape[0], 10))
sgd.zero_grad()
predict_y = model(train_x.float())
loss = cost(predict_y, train_label.long())
if idx % 10 == 0:
print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
loss.backward()
sgd.step()

correct = 0
_sum = 0
model.eval()
for idx, (test_x, test_label) in enumerate(test_loader):
test_x = test_x
test_label = test_label
predict_y = model(test_x.to(device).float()).detach()
predict_ys = np.argmax(predict_y.cpu(), axis=-1)
label_np = test_label.numpy()
_ = predict_ys == test_label
correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
        #log output
        print('accuracy: {:.2f}'.format(correct / _sum))
        #The model output location is /tmp/output
torch.save(model, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))

+108 -0 gpu/train_for_multidataset.py

@@ -0,0 +1,108 @@
'''
1. Structure of the datasets uploaded for multi-dataset training in this example:
MnistDataset_torch.zip
├── test
└── train
checkpoint_epoch1_0.73.zip
├── mnist_epoch1_0.73.pkl
2. For A100 compatibility, before using the training environment please debug your code in the
debugging environment with a platform-recommended image containing CUDA 11 or above
(the image used in this example is dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191),
submit the image, and then switch to the training environment to train the verified code.
In the training environment, the uploaded datasets are automatically placed under /dataset.
Note: the paths differ between single-dataset and multi-dataset mode!
(1) Single dataset: if MnistDataset_torch.zip is selected, the dataset directories are
/dataset/train and /dataset/test. The single-dataset structure in the training image is:
dataset
├── test
└── train
(2) Multiple datasets: if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl. The multi-dataset structure in the
training image is:
dataset
├── MnistDataset_torch
|   ├── test
|   └── train
└── checkpoint_epoch1_0.73
    ├── mnist_epoch1_0.73.pkl
The model download path defaults to /model. Please write model output to /model; the Qizhi
platform UI provides downloads of files under /model.
'''
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#The datasets are placed under /dataset
parser.add_argument('--traindata', default="/dataset/MnistDataset_torch/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/MnistDataset_torch/test", help='path to test dataset')
parser.add_argument('--checkpoint', default="/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl", help='checkpoint file')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used for training')
if __name__ == '__main__':
args = parser.parse_args()
    #log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
sgd = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
epoch = args.epoch_size
    #log output
    print('epoch_size is:{}'.format(epoch))
    #To resume from the uploaded checkpoint (it was saved with torch.save(model, ...), i.e. as a pickled full module):
    # model = torch.load(args.checkpoint, map_location=device)
    #start training
for _epoch in range(epoch):
print('the {} epoch_size begin'.format(_epoch + 1))
model.train()
for idx, (train_x, train_label) in enumerate(train_loader):
train_x = train_x.to(device)
train_label = train_label.to(device)
label_np = np.zeros((train_label.shape[0], 10))
sgd.zero_grad()
predict_y = model(train_x.float())
loss = cost(predict_y, train_label.long())
if idx % 10 == 0:
print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
loss.backward()
sgd.step()
correct = 0
_sum = 0
model.eval()
for idx, (test_x, test_label) in enumerate(test_loader):
test_x = test_x
test_label = test_label
predict_y = model(test_x.to(device).float()).detach()
predict_ys = np.argmax(predict_y.cpu(), axis=-1)
label_np = test_label.numpy()
_ = predict_ys == test_label
correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
        #log output
        print('accuracy: {:.2f}'.format(correct / _sum))
        #The model output location is /model
torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
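Because these scripts save with torch.save(model, ...), the checkpoint .pkl is a pickled full nn.Module rather than a state_dict. A sketch of resuming from the checkpoint dataset under that assumption (illustrative, not part of the commit):

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# torch.load returns the model object itself, so no load_state_dict call is needed
model = torch.load('/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl', map_location=device)
model.train()  # continue training from the restored weights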

+74 -0 gpu_train.py

@@ -0,0 +1,74 @@
'''
For A100 compatibility, before using the training environment please debug your code in the
debugging environment with a platform-recommended image containing CUDA 11 or above
(the image used in this example is dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191),
submit the image, and then switch to the training environment to train the verified code.
In the training environment, the uploaded dataset is automatically placed under /dataset and the
model download path defaults to /model. Please write model output to /model; the Qizhi platform
UI provides downloads of files under /model.
'''
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#The dataset is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used for training')
if __name__ == '__main__':
args = parser.parse_args()
    #log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
sgd = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
epoch = args.epoch_size
    #log output
print('epoch_size is:{}'.format(epoch))
for _epoch in range(epoch):
print('the {} epoch_size begin'.format(_epoch + 1))
model.train()
for idx, (train_x, train_label) in enumerate(train_loader):
train_x = train_x.to(device)
train_label = train_label.to(device)
label_np = np.zeros((train_label.shape[0], 10))
sgd.zero_grad()
predict_y = model(train_x.float())
loss = cost(predict_y, train_label.long())
if idx % 10 == 0:
print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
loss.backward()
sgd.step()
correct = 0
_sum = 0
model.eval()
for idx, (test_x, test_label) in enumerate(test_loader):
test_x = test_x
test_label = test_label
predict_y = model(test_x.to(device).float()).detach()
predict_ys = np.argmax(predict_y.cpu(), axis=-1)
label_np = test_label.numpy()
_ = predict_ys == test_label
correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
        #log output
        print('accuracy: {:.2f}'.format(correct / _sum))
        #The model output location is /model
torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))

+30 -0 gpu_train_resnet50.py

@@ -0,0 +1,30 @@
'''
For A100 compatibility, before using the training environment please debug your code in the
debugging environment with a platform-recommended image containing CUDA 11 or above
(the image used in this example is dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191),
submit the image, and then switch to the training environment to train the verified code.
In the training environment, the uploaded dataset is automatically placed under /dataset and the
model download path defaults to /model. Please write model output to /model; the Qizhi platform
UI provides downloads of files under /model.
'''
import torchvision
from torch.autograd import Variable
import torch
import argparse
# Training settings
parser = argparse.ArgumentParser(description='Resnet50 Example')
#The dataset is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used for training')
if __name__ == '__main__':
input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(1, 3, 224, 224)).cuda()
model = torchvision.models.resnet50(pretrained=True).cuda()
    #The model output location is /model
torch.save(model, '/model/resnet50.pth')

+154 -0 grampus_tf_train.py

@@ -0,0 +1,154 @@
# coding: utf-8
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
import argparse
import moxing as mox

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
workroot = '/cache/'
#initialize a convolution filter variable
def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
#initialize a bias variable; all values start at 0.1
def bias_variable(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))
#convolution; strides gives the stride per dimension, usually strides[0]=strides[3]=1
#the padding parameter is "SAME" or "VALID"; "SAME" pads the borders with zeros
def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")
#2x2 max pooling
def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

def parse_args():
    parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')

# define 2 parameters for running on modelArts
    # data_url and train_url are the fixed ModelArts training parameters: the dataset path and the model output path
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= workroot + '/dataset/')

parser.add_argument('--train_url',
help='model folder to save/load',
default= workroot + '/output/')
parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
        help='device where the code will be implemented (default: CPU). To use the NPU on the Qizhi platform, add the run parameter device_target=Ascend on the training page')

    #ModelArts already uses data_url and train_url by default
parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

args = parser.parse_args()
return args

if __name__ == "__main__":
args = parse_args()
print('args:')
print(args)

    mnist = input_data.read_data_sets('mnist_data', one_hot=True)
    #create the placeholder x to temporarily hold MNIST image data;
    #None in [None, 784] means an unrestricted batch length, and 784 is the size of one image (28x28=784)
    x = tf.placeholder(tf.float32, [None, 784], name='input')
    #y_ holds the actual labels of the input images
    y_ = tf.placeholder(tf.float32, [None, 10])
    #reshape each 784-dim vector back into a 28x28 image;
    #the last dimension is the depth (1, since MNIST is grayscale),
    #and -1 leaves the batch dimension flexible so the batch size can be set freely
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    #first convolutional layer
    #the filter is a 5x5x1 matrix: 5x5 is the filter size and 1 is the depth (MNIST images are grayscale, so depth 1)
    #32 means we create 32 filters of size 5x5x1; the convolution yields 32 feature maps (one per filter), i.e. an output depth of 32
    W_conv1 = weight_variable([5, 5, 1, 32])
    #one bias per feature map
    b_conv1 = bias_variable([32])
    #convolve with conv2d, then apply ReLU as the activation function
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    #pooling after the convolution
    h_pool1 = max_pool_2x2(h_conv1)
    #second convolutional layer
    #the first layer outputs depth 32, so the filter depth and the next output depth change accordingly
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    #fully connected layer
    #after two convolutions the image is 7x7: the first pooling outputs (28/2)x(28/2),
    #the second outputs (14/2)x(14/2), with depth 64;
    #here we add a fully connected layer of 1024 neurons, so the weight W has shape [7 * 7 * 64, 1024]
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    #the number of biases matches the number of neurons
    b_fc1 = bias_variable([1024])
    #flatten the pooled tensor (7x7x64) into a vector (the same input form as the earlier Softmax model)
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    #ReLU activation
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    #dropout
    #apply dropout before the output layer to reduce overfitting
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    #output layer
    #the fully connected layer has 1024 inputs and we want 10 outputs (digits 0-9),
    #so the weight W has shape [1024, 10]
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    #finally apply Softmax to turn the outputs into probabilities
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output')
    #loss function and optimizer
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv)))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    #test accuracy, the same as for the Softmax regression model
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    train_dir = '/cache/output/' #model output path
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    obs_train_url = args.train_url
    #start training
    with tf.Session() as sess:
        #initialize all variables
        sess.run(tf.global_variables_initializer())
        #train for 2000 steps
        for i in range(2000):
            #fetch 50 images and their labels per step
            batch = mnist.train.next_batch(50)
            #print the training accuracy every 100 steps
            if i % 100 == 0:
                train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                print("step %d, training accuracy %g" % (i, train_accuracy))
            #the actual training step, feeding in the batch
            sess.run(train_step, feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
        # save in the SavedModel format
        tf.compat.v1.saved_model.simple_save(sess,
                                             train_dir + "saved_model",
                                             inputs={"input": x, 'keep_prob': keep_prob},
                                             outputs={"output": y_conv})
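A quick way to verify the export is to load the SavedModel back in a fresh session. A minimal sketch (not part of the commit); it relies only on the 'serve' tag that simple_save writes and the tensor names defined above:

import numpy as np
import tensorflow as tf

export_dir = '/cache/output/saved_model'
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    # simple_save tags the graph with 'serve' by default
    tf.compat.v1.saved_model.loader.load(sess, ['serve'], export_dir)
    g = sess.graph
    x = g.get_tensor_by_name('input:0')
    keep_prob = g.get_tensor_by_name('keep_prob:0')
    y = g.get_tensor_by_name('output:0')
    # run one dummy batch through the restored graph
    probs = sess.run(y, feed_dict={x: np.zeros((1, 784), np.float32), keep_prob: 1.0})
    print(probs.shape)  # (1, 10)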


+74 -0 grampus_train.py

@@ -0,0 +1,74 @@
'''
For A100 compatibility, before using the training environment please debug your code in the
debugging environment with a platform-recommended image containing CUDA 11 or above
(the image used in this example is dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191),
submit the image, and then switch to the training environment to train the verified code.
In this (Grampus) training environment the uploaded dataset is automatically placed under
/tmp/dataset and the model download path defaults to /tmp/output. Please write model output to
/tmp/output; the Qizhi platform UI provides downloads of files under /tmp/output.
'''


from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#The dataset is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used for training')

if __name__ == '__main__':
args = parser.parse_args()
    #log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
sgd = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
epoch = args.epoch_size
    #log output
print('epoch_size is:{}'.format(epoch))
for _epoch in range(epoch):
print('the {} epoch_size begin'.format(_epoch + 1))
model.train()
for idx, (train_x, train_label) in enumerate(train_loader):
train_x = train_x.to(device)
train_label = train_label.to(device)
label_np = np.zeros((train_label.shape[0], 10))
sgd.zero_grad()
predict_y = model(train_x.float())
loss = cost(predict_y, train_label.long())
if idx % 10 == 0:
print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
loss.backward()
sgd.step()

correct = 0
_sum = 0
model.eval()
for idx, (test_x, test_label) in enumerate(test_loader):
test_x = test_x
test_label = test_label
predict_y = model(test_x.to(device).float()).detach()
predict_ys = np.argmax(predict_y.cpu(), axis=-1)
label_np = test_label.numpy()
_ = predict_ys == test_label
correct += np.sum(_.numpy(), axis=-1)
_sum += _.shape[0]
        #log output
        print('accuracy: {:.2f}'.format(correct / _sum))
        #The model output location is /tmp/output
torch.save(model, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))

+156 -0 inference.py

@@ -0,0 +1,156 @@
"""
######################## inference lenet example ########################
runs LeNet inference according to a saved model file
"""

"""
######################## 推理环境使用说明 ########################
1、在推理环境中,需要将数据集从obs拷贝到推理镜像中,推理完以后,需要将输出的结果拷贝到obs.
(1)将数据集从obs拷贝到推理镜像中:
obs_data_url = args.data_url
args.data_url = '/home/work/user-job-dir/data/'
if not os.path.exists(args.data_url):
os.mkdir(args.data_url)
try:
mox.file.copy_parallel(obs_data_url, args.data_url)
print("Successfully Download {} to {}".format(obs_data_url,
args.data_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_data_url, args.data_url) + str(e))

(2)将模型文件从obs拷贝到推理镜像中:
obs_ckpt_url = args.ckpt_url
args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt'
try:
mox.file.copy(obs_ckpt_url, args.ckpt_url)
print("Successfully Download {} to {}".format(obs_ckpt_url,
args.ckpt_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_ckpt_url, args.ckpt_url) + str(e))

(3)将输出的结果拷贝回obs:
obs_result_url = args.result_url
args.result_url = '/home/work/user-job-dir/result/'
if not os.path.exists(args.result_url):
os.mkdir(args.result_url)
try:
mox.file.copy_parallel(args.result_url, obs_result_url)
print("Successfully Upload {} to {}".format(args.result_url, obs_result_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e))
详细代码可参考以下示例代码:
"""

import os
import argparse
import moxing as mox
import mindspore.nn as nn
from mindspore import context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore import Tensor
import numpy as np
from glob import glob
from dataset import create_dataset
from config import mnist_cfg as cfg
from lenet import LeNet5

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
help='device where the code will be implemented (default: Ascend)')
parser.add_argument('--data_url',
type=str,
default="./Data",
help='path where the dataset is saved')
parser.add_argument('--ckpt_url',
help='model to save/load',
default='./ckpt_url')
parser.add_argument('--result_url',
help='result folder to save/load',
default='./result')

args = parser.parse_args()
    #Copy the dataset from OBS into the inference image:
obs_data_url = args.data_url
args.data_url = '/home/work/user-job-dir/data/'
if not os.path.exists(args.data_url):
os.mkdir(args.data_url)
try:
mox.file.copy_parallel(obs_data_url, args.data_url)
print("Successfully Download {} to {}".format(obs_data_url,
args.data_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_data_url, args.data_url) + str(e))

    #Use mox.file.copy_parallel for folders; to copy a single file, use mox.file.copy (as is done here)
    #Copy the model file from OBS into the inference image:
obs_ckpt_url = args.ckpt_url
args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt'
try:
mox.file.copy(obs_ckpt_url, args.ckpt_url)
print("Successfully Download {} to {}".format(obs_ckpt_url,
args.ckpt_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_ckpt_url, args.ckpt_url) + str(e))

    #Set the output path result_url
obs_result_url = args.result_url
args.result_url = '/home/work/user-job-dir/result/'
if not os.path.exists(args.result_url):
os.mkdir(args.result_url)

args.dataset_path = args.data_url
args.save_checkpoint_path = args.ckpt_url

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
repeat_size = cfg.epoch_size
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

print("============== Starting Testing ==============")
    args.load_ckpt_url = os.path.join(args.save_checkpoint_path)
    print("args.load_ckpt_url is:{}".format(args.load_ckpt_url))
    param_dict = load_checkpoint(args.load_ckpt_url)
load_param_into_net(network, param_dict)
    # define the test dataset; with batch_size=1 a single image is fetched
ds_test = create_dataset(os.path.join(args.dataset_path, "test"), batch_size=1).create_dict_iterator()
data = next(ds_test)

    # images are the test images; labels are their actual classes
images = data["image"].asnumpy()
labels = data["label"].asnumpy()
print('Tensor:', Tensor(data['image']))

    # use model.predict to predict the class of the image
    output = model.predict(Tensor(data['image']))
    predicted = np.argmax(output.asnumpy(), axis=1)
    print('predicted:', predicted)

    # print the predicted and actual classes, and write the result to result_url
print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
filename = 'result.txt'
file_path = os.path.join(args.result_url, filename)
with open(file_path, 'a+') as file:
file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))

# Upload results to obs
    ######################## Copy the output back to OBS (fixed pattern) ########################
    # Copy the inference result from the local environment back to OBS;
    # it can then be downloaded from the corresponding inference task on the Qizhi platform
try:
mox.file.copy_parallel(args.result_url, obs_result_url)
print("Successfully Upload {} to {}".format(args.result_url, obs_result_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e))
    ######################## Copy the output model to OBS ########################

+60 -0 lenet.py

@@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LeNet."""
import mindspore.nn as nn
from mindspore.common.initializer import Normal


class LeNet5(nn.Cell):
"""
Lenet network

Args:
num_class (int): Number of classes. Default: 10.
num_channel (int): Number of channels. Default: 1.

Returns:
Tensor, output tensor
Examples:
        >>> LeNet5(num_class=10)

"""
def __init__(self, num_class=10, num_channel=1, include_top=True):
super(LeNet5, self).__init__()
self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
self.relu = nn.ReLU()
self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
self.include_top = include_top
if self.include_top:
self.flatten = nn.Flatten()
self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))

def construct(self, x):
x = self.conv1(x)
x = self.relu(x)
x = self.max_pool2d(x)
x = self.conv2(x)
x = self.relu(x)
x = self.max_pool2d(x)
if not self.include_top:
return x
x = self.flatten(x)
x = self.relu(self.fc1(x))
x = self.relu(self.fc2(x))
x = self.fc3(x)
return x
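The 16 * 5 * 5 input size of fc1 follows from the 32x32 input configured in config.py: each 5x5 'valid' convolution shrinks the side by 4 and each 2x2 pooling halves it, i.e. 32 -> 28 -> 14 -> 10 -> 5, with 16 channels after conv2. A quick check (a sketch assuming a working MindSpore install; not part of the commit):

import numpy as np
import mindspore as ms
from lenet import LeNet5

net = LeNet5(num_class=10, num_channel=1)
x = ms.Tensor(np.zeros((1, 1, 32, 32), np.float32))
print(net(x).shape)  # (1, 10)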

+35 -0 model.py

@@ -0,0 +1,35 @@
from torch.nn import Module
from torch import nn
class Model(Module):
def __init__(self):
super(Model, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.relu1 = nn.ReLU()
self.pool1 = nn.MaxPool2d(2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.relu2 = nn.ReLU()
self.pool2 = nn.MaxPool2d(2)
self.fc1 = nn.Linear(256, 120)
self.relu3 = nn.ReLU()
self.fc2 = nn.Linear(120, 84)
self.relu4 = nn.ReLU()
self.fc3 = nn.Linear(84, 10)
self.relu5 = nn.ReLU()
def forward(self, x):
y = self.conv1(x)
y = self.relu1(y)
y = self.pool1(y)
y = self.conv2(y)
y = self.relu2(y)
y = self.pool2(y)
y = y.view(y.shape[0], -1)
y = self.fc1(y)
y = self.relu3(y)
y = self.fc2(y)
y = self.relu4(y)
y = self.fc3(y)
y = self.relu5(y)
return y

BIN npu/Example_Picture/数据集上传位置.png
Width: 2304  |  Height: 1098  |  Size: 189 kB

BIN npu/Example_Picture/新建训练任务页面.png
Width: 1724  |  Height: 1444  |  Size: 73 kB

BIN npu/Example_Picture/查看日志页面.png
Width: 1809  |  Height: 788  |  Size: 67 kB

BIN npu/Example_Picture/模型下载页面.png
Width: 1819  |  Height: 776  |  Size: 73 kB

BIN npu/Example_Picture/运行参数界面.png
Width: 2146  |  Height: 1294  |  Size: 108 kB

+71 -0 npu/README.md

@@ -0,0 +1,71 @@

# How to train a model on the Qizhi platform - NPU version

## 1 Overview
- This project uses LeNet-MNIST as an example to briefly introduce how to complete a training task with MindSpore on the Qizhi AI collaboration platform, and aims to give AI beginners a cloud-brain training example.
- You can create your own training task directly with the dataset and code files provided by this project.
- The Qizhi platform integrates ModelArts and OBS, bringing datasets, code, and training resource pools together on the Qizhi AI collaboration platform for developers.
- ModelArts is Huawei Cloud's one-stop AI development platform for developers. It integrates Ascend AI processor resource pools, and users can experience MindSpore on ModelArts.
- OBS is the storage service provided by Huawei Cloud.

## 2 Preparation
- To use the Qizhi platform, you need to create a Qizhi account, clone the code to your own account, and upload the dataset. For the detailed steps, see the beginner training-camp course series in the [OpenI_Learning](https://git.openi.org.cn/zeizei/OpenI_Learning) project.

### 2.1 Data preparation
#### Downloading the dataset
- The dataset can be downloaded from this project's dataset page: [dataset download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/datasets?type=1)
- About the data files
- The MNISTData dataset consists of 10 classes of 28∗28 grayscale images; the training set contains 60000 images and the test set contains 10000 images.
- The directory structure of the dataset files is as follows:
> MNIST_Data
> ├── test
> │ ├── t10k-images-idx3-ubyte
> │ └── t10k-labels-idx1-ubyte
> └── train
> ├── train-images-idx3-ubyte
> └── train-labels-idx1-ubyte

#### Uploading the dataset
- Since this example is developed with MindSpore and must run on NPU chips, the dataset must be uploaded on the NPU page. \
[Note: if you only want to trial-run this example, there is no need to upload the dataset again, because the MNIST_Example dataset used here is already public and can be referenced directly.]
- As shown below:
- ![avatar](Example_Picture/数据集上传位置.png)
### 2.2 Preparing the execution scripts
#### Example code
- The example code can be downloaded from this repository: [code download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example)
- About the code files
- [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py): the training script; it copies the dataset from OBS into the training image, sets the number of epochs, copies the trained model back to OBS, and so on. See the code comments in [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py) for details.

- [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py): the inference script.

- [config.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/config.py): the network configuration; used by train.py.

- [dataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/dataset.py): preprocesses the raw dataset into a dataset usable for network training.
- [lenet.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/lenet.py): the training network; used by train.py.

## 3 Creating a training task
- Once the data and scripts are ready, create a training task to actually run the MindSpore script. First-time users can refer to this example.

### Create a training job with MindSpore as the training framework; the page looks like this:
![avatar](Example_Picture/新建训练任务页面.png)


Table 1: Parameters on the training-job creation page

| Parameter | Description |
| ----------------- | ----------- |
| Code branch | The branch of the repository code to use; master can be selected by default. |
| AI engine | Select [Ascend-Powered-Engine] and the required MindSpore version (the screenshot shows [Mindspore-1.3.0-python3.7-aarch64]; make sure the script matches the selected version). |
| Boot file | The startup script in the code directory. |
| Dataset | A dataset already uploaded to the Qizhi platform. |
| Run parameters | The data location and training output location map to the run parameters data_url and train_url; add run parameters to pass values to other script arguments, such as epoch_size. Only the extra parameters need to be filled in here; data_url and train_url are added as run parameters by default and need not be specified again, only referenced in the code. |
| Resource pool | Select [Ascend: 1 * Ascend 910 CPU: 24 cores 256GiB], i.e. single machine, single card. |

Note: to use the NPU on the Qizhi platform, add the run parameter device_target=Ascend on the training page, otherwise the default is CPU, as shown below (see the argparse sketch after section 4).
![avatar](Example_Picture/运行参数界面.png)
## 4 Viewing the results
### 4.1 The run log can be viewed on the training-job page
![avatar](Example_Picture/查看日志页面.png)
### 4.2 The model files can be downloaded after training finishes
![avatar](Example_Picture/模型下载页面.png)
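The run parameters in Table 1 reach the script through argparse. The following is a rough sketch of the pattern used by the training scripts in this repository (names follow the table above; the block is illustrative, not a verbatim excerpt):

```python
import argparse

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
# data_url and train_url are filled in by the platform (data location / training output location)
parser.add_argument('--data_url', help='path to the training dataset folder')
parser.add_argument('--train_url', help='folder to save the trained model')
# pass device_target=Ascend as a run parameter to train on the NPU
parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'CPU'])
# extra run parameters, e.g. epoch_size=5, map to additional arguments
parser.add_argument('--epoch_size', type=int, default=5, help='training epochs')
args, unknown = parser.parse_known_args()
```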

+33 -0 npu/config.py

@@ -0,0 +1,33 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py
"""

from easydict import EasyDict as edict

mnist_cfg = edict({
'num_classes': 10,
'lr': 0.01,
'momentum': 0.9,
'epoch_size': 10,
'batch_size': 32,
'buffer_size': 1000,
'image_height': 32,
'image_width': 32,
'save_checkpoint_steps': 1875,
'keep_checkpoint_max': 150,
'air_name': "lenet",
})

+60 -0 npu/dataset.py

@@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Produce the dataset
"""

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype


def create_dataset(data_path, batch_size=32, repeat_size=1,
num_parallel_workers=1):
"""
create dataset for train or test
"""
# define dataset
mnist_ds = ds.MnistDataset(data_path)

resize_height, resize_width = 32, 32
rescale = 1.0 / 255.0
shift = 0.0
rescale_nml = 1 / 0.3081
shift_nml = -1 * 0.1307 / 0.3081

# define map operations
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
rescale_op = CV.Rescale(rescale, shift)
hwc2chw_op = CV.HWC2CHW()
type_cast_op = C.TypeCast(mstype.int32)

# apply map operations on images
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)

# apply DatasetOps
buffer_size = 10000
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
mnist_ds = mnist_ds.repeat(repeat_size)

return mnist_ds

+55 -0 npu/dataset_distributed.py

@@ -0,0 +1,55 @@
"""
Produce the dataset:
与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取:
get_rank:获取当前设备在集群中的ID。
get_group_size:获取集群数量。
"""
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype
from mindspore.communication.management import init, get_rank, get_group_size
def create_dataset_parallel(data_path, batch_size=32, repeat_size=1,
num_parallel_workers=1, shard_id=0, num_shards=8):
"""
create dataset for train or test
"""
resize_height, resize_width = 32, 32
rescale = 1.0 / 255.0
shift = 0.0
rescale_nml = 1 / 0.3081
shift_nml = -1 * 0.1307 / 0.3081
    # get shard_id and num_shards: the ID of the current device in the cluster and the number of devices
shard_id = get_rank()
num_shards = get_group_size()
# define dataset
mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id)
# define map operations
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
rescale_op = CV.Rescale(rescale, shift)
hwc2chw_op = CV.HWC2CHW()
type_cast_op = C.TypeCast(mstype.int32)
# apply map operations on images
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)
# apply DatasetOps
buffer_size = 10000
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
mnist_ds = mnist_ds.repeat(repeat_size)
return mnist_ds
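Note that create_dataset_parallel overwrites its shard_id/num_shards arguments with get_rank()/get_group_size(), so HCCL must be initialized before the function is called. A minimal calling sketch (assumes an Ascend distributed job; illustrative, not part of the commit):

from mindspore import context
from mindspore.communication.management import init
from dataset_distributed import create_dataset_parallel

context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
init()  # initialize HCCL; get_rank()/get_group_size() are valid only after this
ds_train = create_dataset_parallel('/cache/data/train', batch_size=32)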

+202 -0 npu/inference.py

@@ -0,0 +1,202 @@
"""
######################## single-dataset inference lenet example ########################
This example is a single-dataset inference tutorial.

######################## Instructions for using the inference environment ########################
The image of the debugging environment and the image of the inference environment are two different images,
and the working local directories are different. In the inference task, you need to pay attention to the following points.
1、(1)The structure of the dataset uploaded for single dataset inference in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte

(2)The dataset structure of the single dataset in the inference image in this example
workroot
├── data
| ├── test
| └── train

2. The inference task requires the following predefined functions.
(1) Decide whether the task runs in an inference environment or a debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The inference task uses this parameter to represent the local path of the inference image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

(2) Copy the single dataset from OBS to the inference image.
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return

(3) Copy the ckpt file from OBS to the inference image.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
try:
mox.file.copy(obs_ckpt_url, ckpt_url)
print("Successfully Download {} to {}".format(obs_ckpt_url,
ckpt_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_ckpt_url, ckpt_url) + str(e))
return

(4) Copy the output result to OBS.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return

3. Four parameters need to be defined:
--data_url is the dataset you selected on the Qizhi platform
--ckpt_url is the weight file you selected on the Qizhi platform

--data_url, --ckpt_url, --result_url and --device_target must be defined first in a single-dataset
inference task, otherwise an error will be reported.
There is no need to add these parameters to the run parameters of the Qizhi platform;
they are predefined in the background, so you only need to define them in your code.

4. How the dataset is used
The inference task takes data_url as the input, and data_dir (i.e. workroot + '/data') is how the
dataset is referenced inside the image.
For details, please refer to the following sample code.
"""

import os
import argparse
import moxing as mox
import mindspore.nn as nn
from mindspore import context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore import Tensor
import numpy as np
from glob import glob
from dataset import create_dataset
from config import mnist_cfg as cfg
from lenet import LeNet5

### Decide whether the task runs in an inference environment or a debugging environment ###
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir'
elif environment == 'debug':
workroot = '/home/work'
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

### Copy single dataset from obs to inference image ###
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return
### Copy the ckpt file from OBS to the inference image ###
### Use mox.file.copy_parallel for folders; to copy a single file, use mox.file.copy (as here) ###
def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
try:
mox.file.copy(obs_ckpt_url, ckpt_url)
print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e))
return
### Copy the output result to OBS ###
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return

### --data_url, --ckpt_url, --result_url and --device_target must be defined first in an inference
### task, otherwise an error will be reported.
### There is no need to add these parameters to the run parameters of the Qizhi platform;
### they are predefined in the background, so you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--data_url',
type=str,
default= WorkEnvironment('train') + '/data/',
help='path where the dataset is saved')
parser.add_argument('--ckpt_url',
help='model to save/load',
default= WorkEnvironment('train') + '/checkpoint.ckpt')
parser.add_argument('--result_url',
help='result folder to save/load',
default= WorkEnvironment('train') + '/result/')
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
help='device where the code will be implemented (default: Ascend)')

if __name__ == "__main__":
args, unknown = parser.parse_known_args()
### defining the training environment
environment = 'train'
workroot = WorkEnvironment(environment)

###Initialize the data and result directories in the inference image###
data_dir = workroot + '/data'
result_dir = workroot + '/result'
ckpt_url = workroot + '/checkpoint.ckpt'
if not os.path.exists(data_dir):
os.makedirs(data_dir)
if not os.path.exists(result_dir):
os.makedirs(result_dir)
###Copy dataset from obs to inference image
obs_data_url = args.data_url
ObsToEnv(obs_data_url, data_dir)

###Copy ckpt file from obs to inference image
obs_ckpt_url = args.ckpt_url
ObsUrlToEnv(obs_ckpt_url, ckpt_url)

###Set output path result_url
obs_result_url = args.result_url

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
repeat_size = cfg.epoch_size
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

print("============== Starting Testing ==============")

param_dict = load_checkpoint(os.path.join(ckpt_url))
load_param_into_net(network, param_dict)
ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator()
data = next(ds_test)
images = data["image"].asnumpy()
labels = data["label"].asnumpy()
print('Tensor:', Tensor(data['image']))
    output = model.predict(Tensor(data['image']))
    predicted = np.argmax(output.asnumpy(), axis=1)
    print('predicted:', predicted)

print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
filename = 'result.txt'
file_path = os.path.join(result_dir, filename)
with open(file_path, 'a+') as file:
file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))

###Copy result data from the local running environment back to obs,
###and download it in the inference task corresponding to the Qizhi platform
EnvToObs(result_dir, obs_result_url)
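
The block above scores only a single test sample. A minimal sketch of a full test-set evaluation loop, reusing the `model`, `create_dataset`, `data_dir`, `Tensor`, and `np` names already defined in this script (an editor's illustration, not part of the commit):

# Hedged sketch: iterate over the whole test set and report overall accuracy.
correct, total = 0, 0
ds_eval = create_dataset(os.path.join(data_dir, "test"), batch_size=32)
for batch in ds_eval.create_dict_iterator():
    logits = model.predict(Tensor(batch["image"]))
    preds = np.argmax(logits.asnumpy(), axis=1)
    labels_np = batch["label"].asnumpy()
    correct += int((preds == labels_np).sum())
    total += labels_np.shape[0]
print("test accuracy: {:.4f}".format(correct / total))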

+ 60
- 0
npu/lenet.py View File

@@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LeNet."""
import mindspore.nn as nn
from mindspore.common.initializer import Normal


class LeNet5(nn.Cell):
"""
Lenet network

    Args:
        num_class (int): Number of classes. Default: 10.
        num_channel (int): Number of channels. Default: 1.
        include_top (bool): Whether to include the flatten and dense classifier head. Default: True.

    Returns:
        Tensor, output tensor

    Examples:
        >>> LeNet5(num_class=10)

"""
def __init__(self, num_class=10, num_channel=1, include_top=True):
super(LeNet5, self).__init__()
self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
self.relu = nn.ReLU()
self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
self.include_top = include_top
if self.include_top:
self.flatten = nn.Flatten()
self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))

def construct(self, x):
x = self.conv1(x)
x = self.relu(x)
x = self.max_pool2d(x)
x = self.conv2(x)
x = self.relu(x)
x = self.max_pool2d(x)
if not self.include_top:
return x
x = self.flatten(x)
x = self.relu(self.fc1(x))
x = self.relu(self.fc2(x))
x = self.fc3(x)
return x
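
A quick smoke test of the network above (editor's sketch, not part of the commit): a 32x32 single-channel batch should map to 10 logits, matching the 16 * 5 * 5 flatten size assumed by fc1.

import numpy as np
import mindspore as ms

# Hedged sketch: check the forward-pass shape of LeNet5.
net = LeNet5(num_class=10, num_channel=1)
x = ms.Tensor(np.random.randn(2, 1, 32, 32).astype(np.float32))
print(net(x).shape)  # expected: (2, 10)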

+ 193
- 0
npu/train.py View File

@@ -0,0 +1,193 @@
"""
######################## single-dataset train lenet example ########################
This example is a single-dataset training tutorial. If you are training with multiple datasets, refer to
the multi-dataset tutorial train_for_multidataset.py; this example does not support multiple datasets.

######################## Instructions for using the training environment ########################
The debugging environment and the training environment use two different images with different local
working directories, so pay attention to the following points in a training task.
1. (1) Structure of the dataset uploaded for single-dataset training in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte

(2) Structure of the dataset inside the training image in this example
workroot
├── data
| ├── test
| └── train

2. Single-dataset training requires the following predefined functions:
(1) Decide whether the task runs in the training environment or the debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

(2) Copy the single dataset from obs to the training image.
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return

(3)Copy the output model to obs.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return

3. Three parameters need to be defined:
--data_url is the dataset you selected on the Qizhi platform.

--data_url, --train_url, and --device_target must be defined first in a single-dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.

4. How the dataset is used:
A single dataset takes data_url as its input; inside the image the dataset is accessed through
data_dir (i.e. workroot + '/data').
For details, refer to the sample code below.

"""

import os
import argparse
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed

### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir'
elif environment == 'debug':
        workroot = '/home/ma-user/work'  # matches the debug path documented above
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

### Copy single dataset from obs to training image###
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return
### Copy the output model to obs###
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return

### --data_url, --train_url, --device_target: these 3 parameters must be defined first in a
### single-dataset task, otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= WorkEnvironment('train') + '/data/')

parser.add_argument('--train_url',
help='model folder to save/load',
default= WorkEnvironment('train') + '/model/')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')

parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

if __name__ == "__main__":
args, unknown = parser.parse_known_args()
### defining the training environment
environment = 'train'
workroot = WorkEnvironment(environment)

###Initialize the data and model directories in the training image###
data_dir = workroot + '/data'
train_dir = workroot + '/model'
if not os.path.exists(data_dir):
os.makedirs(data_dir)
if not os.path.exists(train_dir):
os.makedirs(train_dir)

### Copy the dataset from obs to the training image ###
ObsToEnv(args.data_url,data_dir)
###Specifies the device CPU or Ascend NPU used for training###
context.set_context(mode=context.GRAPH_MODE,
device_target=args.device_target)
ds_train = create_dataset(os.path.join(data_dir, "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=train_dir,
config=config_ck)
print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)

model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])

###Copy the trained model data from the local running environment back to obs,
###and download it in the training task corresponding to the Qizhi platform
EnvToObs(train_dir, args.train_url)
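
After training, the checkpoints sit in train_dir before the copy to obs. A minimal sketch of reloading the last checkpoint and evaluating on the test split (editor's illustration; the file name is assumed from the <prefix>-<epoch>_<steps>.ckpt pattern that ModelCheckpoint produces with save_checkpoint_steps=1875):

from mindspore import load_checkpoint, load_param_into_net

# Hedged sketch: reload the final checkpoint and evaluate on the test split.
ckpt_path = os.path.join(train_dir, "checkpoint_lenet-{}_1875.ckpt".format(epoch_size))  # assumed name
load_param_into_net(network, load_checkpoint(ckpt_path))
ds_test = create_dataset(os.path.join(data_dir, "test"), cfg.batch_size)
print(model.eval(ds_test, dataset_sink_mode=False))  # e.g. {'accuracy': 0.98}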


+ 205
- 0
npu/train_dataparallel.py View File

@@ -0,0 +1,205 @@
"""
######################## single-dataset train lenet example ########################
This example is a single-dataset training tutorial. If you are training with multiple datasets, refer to
the multi-dataset tutorial train_for_multidataset.py; this example does not support multiple datasets.
######################## Instructions for using the training environment ########################
The debugging environment and the training environment use two different images with different local
working directories, so pay attention to the following points in a training task.
1. (1) Structure of the dataset uploaded for single-dataset training in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
(2) Structure of the dataset inside the training image in this example
workroot
├── data
| ├── test
| └── train
2. Single-dataset training requires the following predefined functions:
(1) Decide whether the task runs in the training environment or the debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot
(2) Copy the single dataset from obs to the training image.
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return
(3)Copy the output model to obs.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return
3. Three parameters need to be defined:
--data_url is the dataset you selected on the Qizhi platform.
--data_url, --train_url, and --device_target must be defined first in a single-dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
4. How the dataset is used:
A single dataset takes data_url as its input; inside the image the dataset is accessed through
data_dir (i.e. workroot + '/data').
For details, refer to the sample code below.
"""
import os
import argparse
from dataset_distributed import create_dataset_parallel
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.common import set_seed
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size
import mindspore.ops as ops
# Set device_id from the DEVICE_ID environment variable (default 0) and init the communication backend.
device_id = int(os.getenv('DEVICE_ID', '0'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
init()
### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir'
elif environment == 'debug':
        workroot = '/home/ma-user/work'  # matches the debug path documented above
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot
### Copy single dataset from obs to training image###
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return
### Copy the output model to obs###
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return
### --data_url, --train_url, --device_target: these 3 parameters must be defined first in a
### single-dataset task, otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= WorkEnvironment('train') + '/data/')
parser.add_argument('--train_url',
help='model folder to save/load',
default= WorkEnvironment('train') + '/model/')
parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')
set_seed(114514)
if __name__ == "__main__":
args = parser.parse_args()
### defining the training environment
environment = 'train'
workroot = WorkEnvironment(environment)
###Initialize the data and model directories in the training image###
data_dir = workroot + '/data'
train_dir = workroot + '/model'
if not os.path.exists(data_dir):
os.makedirs(data_dir)
if not os.path.exists(train_dir):
os.makedirs(train_dir)
### Copy the dataset from obs to the training image ###
ObsToEnv(args.data_url,data_dir)
context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
ds_train = create_dataset_parallel(os.path.join(data_dir, "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")
config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In the example, get_rank() is added to distinguish different paths.
ckpoint_cb = ModelCheckpoint(prefix="data_parallel",
directory=train_dir + "/" + str(get_rank()) + "/",
config=config_ck)
print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)
model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()], dataset_sink_mode=True)
###Copy the trained model data from the local running environment back to obs,
###and download it in the training task corresponding to the Qizhi platform
EnvToObs(train_dir, args.train_url)
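
create_dataset_parallel comes from dataset_distributed.py, which is not shown in this excerpt. A sketch of what such a shard-aware loader typically looks like, inferred from the get_rank/get_group_size imports above (assumed; the real file likely also adds resize/normalize transforms):

import mindspore.dataset as ds
from mindspore.communication.management import get_rank, get_group_size

def create_dataset_parallel_sketch(data_path, batch_size=32):
    # Each card reads a disjoint shard of the MNIST source.
    rank_id, rank_size = get_rank(), get_group_size()
    mnist_ds = ds.MnistDataset(data_path, num_shards=rank_size, shard_id=rank_id)
    return mnist_ds.batch(batch_size, drop_remainder=True)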

+ 92
- 0
npu/train_for_c2net.py View File

@@ -0,0 +1,92 @@
"""
######################## train lenet example ########################
train lenet and get network model files(.ckpt)

Training on the intelligent computing network currently supports single-dataset training only and does
not require the obs copy step. Only two paths need to be defined and used directly:
train_dir = '/cache/output' #The location of the output
data_dir = '/cache/dataset' #The location of the dataset
"""
#!/usr/bin/python
#coding=utf-8

import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')

parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

set_seed(1)

if __name__ == "__main__":
args, unknown = parser.parse_known_args()
print('args:')
print(args)

    ### Define the two fixed paths used by the platform ###
train_dir = '/cache/output'
data_dir = '/cache/dataset'
###Specifies the device CPU or Ascend NPU used for training###
context.set_context(mode=context.GRAPH_MODE,
device_target=args.device_target)
ds_train = create_dataset(os.path.join(data_dir, "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=train_dir,
config=config_ck)
print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)

model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])

print("============== Finish Training ==============")

+ 237
- 0
npu/train_for_multidataset.py View File

@@ -0,0 +1,237 @@
"""
######################## multi-dataset train lenet example ########################
This example is a multi-dataset training tutorial. If you are training with a single dataset, refer to
the single-dataset tutorial train.py; this example does not support single-dataset training.
"""
"""
######################## Instructions for using the training environment ########################
1. (1) Structure of the datasets uploaded for multi-dataset training in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
checkpoint_lenet-1_1875.zip
└── checkpoint_lenet-1_1875.ckpt

(2) Structure of the datasets inside the training image in this example
workroot
├── MNISTData
| ├── test
| └── train
└── checkpoint_lenet-1_1875
    └── checkpoint_lenet-1_1875.ckpt

2. Multi-dataset training requires the following predefined functions:
(1) Decide whether the task runs in the training environment or the debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

(2) Copy multiple datasets from obs to the training image.
def MultiObsToEnv(multi_data_url, workroot):
multi_data_json = json.loads(multi_data_url) #Parse multi_data_url
for i in range(len(multi_data_json)):
path = workroot + "/" + multi_data_json[i]["dataset_name"]
if not os.path.exists(path):
os.makedirs(path)
try:
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],
path))
except Exception as e:
print('moxing download {} to {} failed: '.format(
multi_data_json[i]["dataset_url"], path) + str(e))
return

The input and output of the MultiObsToEnv function in this example:
Input for multi_data_url:
[
    {
        "dataset_url": "s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388eae3a316-42d6-4a43-a484-1fa573eab388/",  # obs path of the dataset
        "dataset_name": "MNISTData"  # the name of the dataset
    },
    {
        "dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c59be66-64ec-41ca-b311-f51a486eabf8/",
        "dataset_name": "checkpoint_lenet-1_1875"
    }
]
Purpose of multi_data_url:
MultiObsToEnv copies multiple datasets from obs to the training image and builds the dataset paths
inside the training image.
For example, the path of the MNISTData dataset in this example is /home/work/user-job-dir/MNISTData,
and the path of the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875.

(3)Copy the output model to obs.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,
obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,
obs_train_url) + str(e))
return

3. Four parameters need to be defined:
--data_url is the first dataset you selected on the Qizhi platform.
--multi_data_url is the multi-dataset selection you made on the Qizhi platform.

--data_url, --multi_data_url, --train_url, and --device_target must be defined first in a multi-dataset
task, otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.

4. How the datasets are used:
Multiple datasets take multi_data_url as input; inside the training image a dataset is accessed through
workroot + dataset name + the file or folder name inside the dataset.
For example, the path of the train folder of the MNISTData dataset in this example is
workroot + "/MNISTData" + "/train".

For details, please refer to the following sample code.
"""

import os
import argparse

import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import json
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed
from mindspore import load_checkpoint, load_param_into_net

### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir'
elif environment == 'debug':
workroot = '/home/ma-user/work'
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

### Copy multiple datasets from obs to training image ###
def MultiObsToEnv(multi_data_url, workroot):
multi_data_json = json.loads(multi_data_url)
for i in range(len(multi_data_json)):
path = workroot + "/" + multi_data_json[i]["dataset_name"]
if not os.path.exists(path):
os.makedirs(path)
try:
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],
path))
except Exception as e:
print('moxing download {} to {} failed: '.format(
multi_data_json[i]["dataset_url"], path) + str(e))
return
### Copy the output model to obs ###
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,
obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,
obs_train_url) + str(e))
return


parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
### --data_url, --multi_data_url, --train_url, --device_target: these 4 parameters must be defined first
### in a multi-dataset task, otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= WorkEnvironment('train') + '/data/')

parser.add_argument('--multi_data_url',
help='path to multi dataset',
default= WorkEnvironment('train'))

parser.add_argument('--train_url',
help='model folder to save/load',
default= WorkEnvironment('train') + '/model/')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')

parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

if __name__ == "__main__":
args, unknown = parser.parse_known_args()
    # After defining the training environment, first call WorkEnvironment and MultiObsToEnv to copy
    # the datasets from obs into the training image.
environment = 'train'
workroot = WorkEnvironment(environment)
MultiObsToEnv(args.multi_data_url, workroot)

### Define the output path in the training image
train_dir = workroot + '/model'
if not os.path.exists(train_dir):
os.makedirs(train_dir)

context.set_context(mode=context.GRAPH_MODE,
device_target=args.device_target)
    # The dataset path used here: workroot + "/MNISTData" + "/train"
ds_train = create_dataset(os.path.join(workroot + "/MNISTData", "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

### Load the trained model:workroot + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt"
load_param_into_net(network, load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875",
"checkpoint_lenet-1_1875.ckpt")))

if args.device_target != "Ascend":
model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()})
else:
model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2")

config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=train_dir,
config=config_ck)
print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)

model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])
###Copy the trained model data from the local running environment back to obs,
###and download it in the training task corresponding to the Qizhi platform
EnvToObs(train_dir, args.train_url)
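
MultiObsToEnv only depends on the JSON shape of multi_data_url, so its parsing can be sanity-checked without obs access (editor's sketch; the URLs are made up):

import json

# Hedged sketch: a stand-in multi_data_url payload in the format parsed above.
sample = json.dumps([
    {"dataset_url": "s3://bucket/path/MNISTData/", "dataset_name": "MNISTData"},
    {"dataset_url": "s3://bucket/path/ckpt/", "dataset_name": "checkpoint_lenet-1_1875"},
])
for entry in json.loads(sample):
    print(entry["dataset_name"], "<-", entry["dataset_url"])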

BIN
npu_test_model_4wxt_0.0.1.zip View File


+ 16
- 0
running.py View File

@@ -0,0 +1,16 @@
#!/usr/bin/python
#-*- coding: UTF-8 -*-
import time
import datetime

timeStart = datetime.datetime.now()
print(timeStart.strftime('%Y-%m-%d %H:%M:%S'))
for letter in 'Python':
    print('current letter: %s' % letter)
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    time.sleep(30)

timeEnd = datetime.datetime.now()
print(timeEnd.strftime('%Y-%m-%d %H:%M:%S'))
print('elapsed seconds: %s' % (timeEnd - timeStart).seconds)
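
Note that subtracting two datetime objects and reading .seconds truncates to whole seconds; time.perf_counter gives sub-second resolution if that matters (editor's sketch):

import time

t0 = time.perf_counter()
time.sleep(1.5)
print('elapsed: %.3f s' % (time.perf_counter() - t0))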

+ 1
- 0
test.py View File

@@ -0,0 +1 @@
print('for test only')

+ 93
- 0
test_c2net_npu.py View File

@@ -0,0 +1,93 @@
#!/usr/bin/python
#coding=utf-8

"""
######################## train lenet example ########################
train lenet and get network model files(.ckpt)
"""

import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the NPU on the Qizhi platform, add the running parameter device_target=Ascend on the training page')

parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

set_seed(1)

if __name__ == "__main__":
args = parser.parse_args()
print('args:')
print(args)

    # train_dir = '/tmp/output'
    # data_dir = '/tmp/dataset'
    train_dir = '/cache/output'
    data_dir = '/cache/dataset'
    # Important: this selects the training device, CPU or Ascend NPU.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    # Create the dataset.
ds_train = create_dataset(os.path.join(data_dir, "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
    # Create the network.
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
    # Define the model output path.
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=train_dir,
                                 config=config_ck)
    # Start training.
    print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)

model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])

print("============== Finish Training ==============")

+ 158
- 0
tf_train.py View File

@@ -0,0 +1,158 @@
# coding: utf-8
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
import argparse
import moxing as mox

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
workroot = '/home/work/user-job-dir'
# Initialize a convolution filter variable.
def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
# Initialize a bias variable; every value starts at 0.1.
def bias_variable(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))
# Convolution; strides is the step per dimension, usually strides[0] = strides[3] = 1.
# padding is "SAME" or "VALID"; "SAME" zero-pads the borders.
def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")
# 2x2 max pooling.
def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

def parse_args():
    parser = argparse.ArgumentParser(description='TensorFlow LeNet Example')

    # Define 2 fixed parameters for running on ModelArts:
    # data_url and train_url are the dataset path and the model output path.
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= workroot + '/data/')

parser.add_argument('--train_url',
help='model folder to save/load',
default= workroot + '/model/')
parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
        help='device where the code will run (default: Ascend); to use the NPU on the Qizhi platform, add the running parameter device_target=Ascend on the training page')

    # ModelArts already uses data_url and train_url by default.
parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

args = parser.parse_args()
return args

if __name__ == "__main__":
args = parse_args()
print('args:')
print(args)

    mnist = input_data.read_data_sets('mnist_data', one_hot=True)
    # Placeholder x temporarily holds the MNIST image data.
    # In [None, 784], None means an unrestricted batch size and 784 is one flattened image (28*28=784).
    x = tf.placeholder(tf.float32, [None, 784], name='input')
    # y_ holds the ground-truth label of each input image.
    y_ = tf.placeholder(tf.float32, [None, 10])
    # Reshape each 784-dim vector back into a 28x28 image.
    # The last dimension is the depth (1, since MNIST is grayscale);
    # the leading -1 leaves the batch size flexible.
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    # First convolutional layer:
    # 32 filters of size 5x5x1 (depth 1 because MNIST images have a single channel),
    # producing 32 feature maps, i.e. an output depth of 32.
    W_conv1 = weight_variable([5, 5, 1, 32])
    # One bias per feature map.
    b_conv1 = bias_variable([32])
    # Convolve with conv2d, then apply ReLU.
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    # Pool after the convolution.
    h_pool1 = max_pool_2x2(h_conv1)
    # Second convolutional layer:
    # the first layer outputs depth 32, so the filter depth and output depth change accordingly.
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)
    # Fully connected layer:
    # after two conv+pool stages the feature map is 7x7 ((28/2)/2 = 7) with depth 64.
    # A 1024-neuron dense layer follows, so W has shape [7 * 7 * 64, 1024].
    W_fc1 = weight_variable([7 * 7 * 64, 1024])
    # One bias per neuron.
    b_fc1 = bias_variable([1024])
    # Flatten the pooled tensor (7 x 7 x 64) into a vector.
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    # ReLU activation.
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
    # Dropout before the output layer to reduce overfitting.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    # Output layer: 1024 units feed into 10 classes (digits 0-9), so W has shape [1024, 10].
    W_fc2 = weight_variable([1024, 10])
    b_fc2 = bias_variable([10])
    # Softmax turns the logits into class probabilities.
    y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output')
    # Loss and optimizer.
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv)))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    # Accuracy, computed the same way as for the softmax regression model.
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    train_dir = workroot + '/model/'  # model output path
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    obs_train_url = args.train_url
    # Start training.
    with tf.Session() as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Train for 2000 steps.
        for i in range(2000):
            # Fetch 50 images and labels per step.
            batch = mnist.train.next_batch(50)
            # Print the training accuracy every 100 steps.
            if i % 100 == 0:
                train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                print("step %d, training accuracy %g" % (i, train_accuracy))
            # The actual training step: feed the batch in.
            sess.run(train_step, feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
        # Save in the SavedModel format.
        tf.compat.v1.saved_model.simple_save(sess,
                                             train_dir + "saved_model",
                                             inputs={"input": x, 'keep_prob': keep_prob},
                                             outputs={"output": y_conv})
        try:
            mox.file.copy_parallel(train_dir, obs_train_url)
            print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
        except Exception as e:
            print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
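
A minimal sketch of reloading the SavedModel written above for inference (editor's illustration; simple_save tags the graph with "serve"):

import tensorflow as tf

# Hedged sketch: reload the exported SavedModel and run one test image through it.
with tf.Session(graph=tf.Graph()) as sess:
    tf.compat.v1.saved_model.loader.load(sess, ["serve"], train_dir + "saved_model")
    x_in = sess.graph.get_tensor_by_name("input:0")
    kp = sess.graph.get_tensor_by_name("keep_prob:0")
    y_out = sess.graph.get_tensor_by_name("output:0")
    probs = sess.run(y_out, feed_dict={x_in: mnist.test.images[:1], kp: 1.0})
    print(probs.argmax(axis=1))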

+ 146
- 0
tf_train_new.py View File

@@ -0,0 +1,146 @@
# coding: utf-8
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
mnist = input_data.read_data_sets('mnist_data', one_hot=True)
# Initialize a convolution filter variable.
def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
# Initialize a bias variable; every value starts at 0.1.
def bias_variable(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))
# Convolution; strides is the step per dimension, usually strides[0] = strides[3] = 1.
# padding is "SAME" or "VALID"; "SAME" zero-pads the borders.
def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")
# 2x2 max pooling.
def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
# Placeholder x temporarily holds the MNIST image data.
# In [None, 784], None means an unrestricted batch size and 784 is one flattened image (28*28=784).
x = tf.placeholder(tf.float32, [None, 784], name='input')
# y_ holds the ground-truth label of each input image.
y_ = tf.placeholder(tf.float32, [None, 10])
# Reshape each 784-dim vector back into a 28x28 image.
# The last dimension is the depth (1, since MNIST is grayscale);
# the leading -1 leaves the batch size flexible.
x_image = tf.reshape(x, [-1, 28, 28, 1])
# First convolutional layer:
# 32 filters of size 5x5x1 (depth 1 because MNIST images have a single channel),
# producing 32 feature maps, i.e. an output depth of 32.
W_conv1 = weight_variable([5, 5, 1, 32])
# One bias per feature map.
b_conv1 = bias_variable([32])
# Convolve with conv2d, then apply ReLU.
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
# Pool after the convolution.
h_pool1 = max_pool_2x2(h_conv1)
# Second convolutional layer:
# the first layer outputs depth 32, so the filter depth and output depth change accordingly.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
# Fully connected layer:
# after two conv+pool stages the feature map is 7x7 ((28/2)/2 = 7) with depth 64.
# A 1024-neuron dense layer follows, so W has shape [7 * 7 * 64, 1024].
W_fc1 = weight_variable([7 * 7 * 64, 1024])
# One bias per neuron.
b_fc1 = bias_variable([1024])
# Flatten the pooled tensor (7 x 7 x 64) into a vector.
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
# ReLU activation.
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# Dropout before the output layer to reduce overfitting.
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# Output layer: 1024 units feed into 10 classes (digits 0-9), so W has shape [1024, 10].
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
# Softmax turns the logits into class probabilities.
y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output')
# Loss and optimizer.
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv)))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# Accuracy, computed the same way as for the softmax regression model.
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# # Save the training result with a Saver; otherwise it is lost when the program exits.
# savePath = './mnist_conv/'
# saveFile = savePath + 'mnist_conv.ckpt'
# if os.path.exists(savePath) == False:
#     os.mkdir(savePath)
# saver = tf.train.Saver()
# Start training.
with tf.Session() as sess:
    # Initialize all variables.
    sess.run(tf.global_variables_initializer())
    # Train for 2000 steps.
    for i in range(2000):
        # Fetch 50 images and labels per step.
        batch = mnist.train.next_batch(50)
        # Print the training accuracy every 100 steps.
        if i % 100 == 0:
            train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
            print("step %d, training accuracy %g" % (i, train_accuracy))
        # The actual training step: feed the batch in.
        sess.run(train_step, feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    # print("end train, start testing...")
    # mean_value = 0.0
    # for i in range(mnist.test.labels.shape[0]):
    #     batch = mnist.test.next_batch(50)
    #     train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
    #     mean_value += train_accuracy
    # print("test accuracy %g" % (mean_value / mnist.test.labels.shape[0]))
    # # After training, measure the final accuracy on mnist.test.
    # print("test accuracy %g" % sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
    # # Finally, save the session.
    # saver.save(sess, saveFile)
    # Save in the SavedModel format.
    tf.compat.v1.saved_model.simple_save(sess,
                                         "./saved_model",
                                         inputs={"input": x, 'keep_prob': keep_prob},
                                         outputs={"output": y_conv})

+ 185
- 0
train.py View File

@@ -0,0 +1,185 @@
"""
######################## train lenet example ########################
train lenet and get network model files(.ckpt)
"""
"""
######################## Instructions for using the training environment ########################
Assume the code has already been debugged in the Ascend NPU debugging environment and you now want to
move it to the training environment. The following work is needed:
1. The debugging image and the training image are two different images with different working
directories, so the data_url and train_url paths must be adjusted.
In the debugging environment:
    args.data_url = '/home/ma-user/work/data/'    # dataset location
    args.train_url = '/home/ma-user/work/model/'  # output model location
In the training environment these become:
    args.data_url = '/home/work/user-job-dir/data/'
    args.train_url = '/home/work/user-job-dir/model/'
2. In the training environment, the dataset must be copied from obs into the training image; after
training, the output model must be copied back to obs.
Copy the dataset from obs into the training image:

obs_data_url = args.data_url
args.data_url = '/home/work/user-job-dir/data/'
if not os.path.exists(args.data_url):
os.mkdir(args.data_url)
try:
mox.file.copy_parallel(obs_data_url, args.data_url)
print("Successfully Download {} to {}".format(obs_data_url,
args.data_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_data_url, args.data_url) + str(e))

Copy the output model to obs:
obs_train_url = args.train_url
args.train_url = '/home/work/user-job-dir/model/'
if not os.path.exists(args.train_url):
os.mkdir(args.train_url)
try:
mox.file.copy_parallel(args.train_url, obs_train_url)
print("Successfully Upload {} to {}".format(args.train_url,
obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(args.train_url,
obs_train_url) + str(e))

"""

import os
import numpy as np
import argparse
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed
from mindspore import Tensor, export

# Configure the default workspace root.
# environment = 'debug'
environment = 'train'
if environment == 'debug':
    workroot = '/home/ma-user/work'  # used by debugging tasks
else:
    workroot = '/home/work/user-job-dir'  # used by training tasks
print('current work mode:' + environment + ', workroot:' + workroot)

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

# Define 2 fixed parameters for running on ModelArts:
# data_url and train_url are the dataset path and the model output path.
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= workroot + '/data/')

parser.add_argument('--train_url',
help='model folder to save/load',
default= workroot + '/model/')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the NPU on the Qizhi platform, add the running parameter device_target=Ascend on the training page')

# ModelArts already uses data_url and train_url by default.
parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

set_seed(1)

if __name__ == "__main__":
args = parser.parse_args()
print('args:')
print(args)
    data_dir = workroot + '/data'     # dataset path
    train_dir = workroot + '/model/'  # model output path
    # Initialize the dataset directory.
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    # Initialize the model output directory.
    obs_train_url = args.train_url
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    ######################## Copy the dataset from obs into the training image (fixed pattern) ########################
    # Define data_url and train_url in the training environment and copy the data from obs to the fixed
    # local path; here the data goes to /home/work/user-job-dir/data/, which can be changed.
    if environment == 'train':
        obs_data_url = args.data_url
        # Copy the data into the training environment.
        try:
            mox.file.copy_parallel(obs_data_url, data_dir)
            print("Successfully Download {} to {}".format(obs_data_url,
                                                          data_dir))
        except Exception as e:
            print('moxing download {} to {} failed: '.format(
                obs_data_url, data_dir) + str(e))
    ######################## Copy the dataset from obs into the training image ########################
    # Important: this selects the training device, CPU or Ascend NPU.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    # Create the dataset.
    ds_train = create_dataset(os.path.join(data_dir, "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
    # Create the network.
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
    # Define the model output path.
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=train_dir,
config=config_ck)
    # Start training.
    print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)

model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])
    # Export the trained network; avoid shadowing the built-in name `input`.
    dummy_input = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32)
    export(network, Tensor(dummy_input), file_name=(train_dir + 'LeNet5_model'), file_format='MINDIR')

    export(network, Tensor(dummy_input), file_name=(train_dir + 'LeNet5_onnx_model'), file_format='ONNX')
    ######################## Copy the output model to obs (fixed pattern) ########################
    # Copy the trained model data from the local running environment back to obs; it can then be
    # downloaded from the corresponding training task on the Qizhi platform.
    if environment == 'train':
        try:
            mox.file.copy_parallel(train_dir, obs_train_url)
            print("Successfully Upload {} to {}".format(train_dir,
                                                        obs_train_url))
        except Exception as e:
            print('moxing upload {} to {} failed: '.format(train_dir,
                                                           obs_train_url) + str(e))
    ######################## Copy the output model to obs ########################
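
A quick way to check the exported ONNX file (editor's sketch; onnxruntime is not a dependency of this commit, and MindSpore's export appends the .onnx suffix):

import numpy as np
import onnxruntime as ort

# Hedged sketch: run one random input through the exported ONNX model.
sess = ort.InferenceSession(train_dir + 'LeNet5_onnx_model.onnx')
x = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32)
logits = sess.run(None, {sess.get_inputs()[0].name: x})[0]
print(logits.argmax(axis=1))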

+ 207
- 0
train_longparam.py View File

@@ -0,0 +1,207 @@
"""
######################## train lenet example ########################
train lenet and get network model files(.ckpt)
"""
"""
######################## Instructions for using the training environment ########################
Assume the code has already been debugged in the Ascend NPU debugging environment and you now want to
move it to the training environment. The following work is needed:
1. The debugging image and the training image are two different images with different working
directories, so the data_url and train_url paths must be adjusted.
In the debugging environment:
    args.data_url = '/home/ma-user/work/data/'    # dataset location
    args.train_url = '/home/ma-user/work/model/'  # output model location
In the training environment these become:
    args.data_url = '/home/work/user-job-dir/data/'
    args.train_url = '/home/work/user-job-dir/model/'
2. In the training environment, the dataset must be copied from obs into the training image; after
training, the output model must be copied back to obs.
Copy the dataset from obs into the training image:

obs_data_url = args.data_url
args.data_url = '/home/work/user-job-dir/data/'
if not os.path.exists(args.data_url):
os.mkdir(args.data_url)
try:
mox.file.copy_parallel(obs_data_url, args.data_url)
print("Successfully Download {} to {}".format(obs_data_url,
args.data_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_data_url, args.data_url) + str(e))

Copy the output model to obs:
obs_train_url = args.train_url
args.train_url = '/home/work/user-job-dir/model/'
if not os.path.exists(args.train_url):
os.mkdir(args.train_url)
try:
mox.file.copy_parallel(args.train_url, obs_train_url)
print("Successfully Upload {} to {}".format(args.train_url,
obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(args.train_url,
obs_train_url) + str(e))

"""

import os
import numpy as np
import argparse
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed
from mindspore import Tensor, export

# Configure the default workspace root.
# environment = 'debug'
environment = 'train'
if environment == 'debug':
    workroot = '/home/ma-user/work'  # used by debugging tasks
else:
    workroot = '/home/work/user-job-dir'  # used by training tasks
print('current work mode:' + environment + ', workroot:' + workroot)

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

# Define 2 fixed parameters for running on ModelArts:
# data_url and train_url are the dataset path and the model output path.
parser.add_argument('--data_url',
help='path to training/inference dataset folder',
default= workroot + '/data/')

parser.add_argument('--train_url',
help='model folder to save/load',
default= workroot + '/model/')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
    help='device where the code will run (default: Ascend); to use the NPU on the Qizhi platform, add the running parameter device_target=Ascend on the training page')

# ModelArts already uses data_url and train_url by default.
parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

parser.add_argument('--openI',
                    help='whether the job runs on the OpenI/Qizhi platform',
                    default=True)
parser.add_argument('--sink_mode',
                    help='whether to use dataset sink mode',
                    default=True)
parser.add_argument('--dataset',
                    help='name of the dataset',
                    default='hmdb51')
parser.add_argument('--checkpoint_path',
                    help='path to a pretrained checkpoint',
                    default='./src/pretrained/rgb_imagenet.ckpt')
parser.add_argument('--mode',
                    help='input modality, e.g. rgb',
                    default='rgb')
parser.add_argument('--num_epochs',
                    help='number of training epochs',
                    default=40)
parser.add_argument('--distributed',
                    help='whether to run distributed training',
                    default=True)

set_seed(1)

if __name__ == "__main__":
args = parser.parse_args()
print('args:')
print(args)
    data_dir = workroot + '/data'     # dataset path
    train_dir = workroot + '/model/'  # model output path
    # Initialize the dataset directory.
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    # Initialize the model output directory.
    obs_train_url = args.train_url
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    ######################## Copy the dataset from obs into the training image (fixed pattern) ########################
    # Define data_url and train_url in the training environment and copy the data from obs to the fixed
    # local path; here the data goes to /home/work/user-job-dir/data/, which can be changed.
    if environment == 'train':
        obs_data_url = args.data_url
        # Copy the data into the training environment.
        try:
            mox.file.copy_parallel(obs_data_url, data_dir)
            print("Successfully Download {} to {}".format(obs_data_url,
                                                          data_dir))
        except Exception as e:
            print('moxing download {} to {} failed: '.format(
                obs_data_url, data_dir) + str(e))
    ######################## Copy the dataset from obs into the training image ########################
    # Important: this selects the training device, CPU or Ascend NPU.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    # Create the dataset.
    ds_train = create_dataset(os.path.join(data_dir, "train"),
cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError(
"Please check dataset size > 0 and batch_size <= dataset size")
    # Create the network.
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
    # Define the model output path.
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=train_dir,
config=config_ck)
    # Start training.
    print("============== Starting Training ==============")
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
print('epoch_size is: ', epoch_size)

model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])
    # Export the trained network; avoid shadowing the built-in name `input`.
    dummy_input = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32)
    export(network, Tensor(dummy_input), file_name=(train_dir + 'LeNet5_model'), file_format='MINDIR')

    export(network, Tensor(dummy_input), file_name=(train_dir + 'LeNet5_onnx_model'), file_format='ONNX')
    ######################## Copy the output model to obs (fixed pattern) ########################
    # Copy the trained model data from the local running environment back to obs; it can then be
    # downloaded from the corresponding training task on the Qizhi platform.
    if environment == 'train':
        try:
            mox.file.copy_parallel(train_dir, obs_train_url)
            print("Successfully Upload {} to {}".format(train_dir,
                                                        obs_train_url))
        except Exception as e:
            print('moxing upload {} to {} failed: '.format(train_dir,
                                                           obs_train_url) + str(e))
    ######################## Copy the output model to obs ########################
