Browse Source

rm files

test_95%
wjtest001 3 years ago
parent
commit
22582c3c63
9 changed files with 0 additions and 1137 deletions
  1. +0
    -33
      npu/config.py
  2. +0
    -60
      npu/dataset.py
  3. +0
    -55
      npu/dataset_distributed.py
  4. +0
    -202
      npu/inference.py
  5. +0
    -60
      npu/lenet.py
  6. +0
    -193
      npu/train.py
  7. +0
    -205
      npu/train_dataparallel.py
  8. +0
    -92
      npu/train_for_c2net.py
  9. +0
    -237
      npu/train_for_multidataset.py

+ 0
- 33
npu/config.py View File

@@ -1,33 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in train.py
"""

from easydict import EasyDict as edict

# Settings shared by the MNIST LeNet example scripts. The values are wrapped
# in an EasyDict, and the scripts read them both as attributes (cfg.lr) and
# as keys (cfg['epoch_size']).
mnist_cfg = edict(dict(
    num_classes=10,
    lr=0.01,
    momentum=0.9,
    epoch_size=10,
    batch_size=32,
    buffer_size=1000,
    image_height=32,
    image_width=32,
    # NOTE(review): 1875 looks like 60000 / batch_size (one MNIST epoch) -- confirm.
    save_checkpoint_steps=1875,
    keep_checkpoint_max=150,
    air_name="lenet",
))

+ 0
- 60
npu/dataset.py View File

@@ -1,60 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Produce the dataset
"""

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype


def create_dataset(data_path, batch_size=32, repeat_size=1,
                   num_parallel_workers=1):
    """Build the MNIST input pipeline used for training or testing.

    Args:
        data_path: Directory handed to ``ds.MnistDataset``.
        batch_size: Samples per batch; an incomplete final batch is dropped.
        repeat_size: Number of times the batched dataset is repeated.
        num_parallel_workers: Worker count passed to every ``map`` call.

    Returns:
        The mapped, shuffled, batched and repeated dataset.
    """
    pipeline = ds.MnistDataset(data_path)

    # Image transforms, listed in the order they are applied:
    # resize to 32x32, scale by 1/255, then compute (x - 0.1307) / 0.3081,
    # and finally move the channel axis first.
    image_ops = [
        CV.Resize((32, 32), interpolation=Inter.LINEAR),  # Bilinear mode
        CV.Rescale(1.0 / 255.0, 0.0),
        CV.Rescale(1 / 0.3081, -1 * 0.1307 / 0.3081),
        CV.HWC2CHW(),
    ]

    # Labels are cast to int32 before any image transform is attached.
    pipeline = pipeline.map(operations=C.TypeCast(mstype.int32),
                            input_columns="label",
                            num_parallel_workers=num_parallel_workers)
    for image_op in image_ops:
        pipeline = pipeline.map(operations=image_op,
                                input_columns="image",
                                num_parallel_workers=num_parallel_workers)

    # Shuffle with a fixed 10000-sample buffer (as in the LeNet train script),
    # then batch and repeat.
    pipeline = pipeline.shuffle(buffer_size=10000)
    pipeline = pipeline.batch(batch_size, drop_remainder=True)
    return pipeline.repeat(repeat_size)

+ 0
- 55
npu/dataset_distributed.py View File

@@ -1,55 +0,0 @@
"""
Produce the dataset:
与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取:
get_rank:获取当前设备在集群中的ID。
get_group_size:获取集群数量。
"""
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.vision import Inter
from mindspore.common import dtype as mstype
from mindspore.communication.management import init, get_rank, get_group_size
def create_dataset_parallel(data_path, batch_size=32, repeat_size=1,
                            num_parallel_workers=1, shard_id=0, num_shards=8):
    """Build the sharded MNIST input pipeline for data-parallel training.

    NOTE: the ``shard_id`` and ``num_shards`` arguments are accepted but
    always overwritten with ``get_rank()`` and ``get_group_size()``, so the
    values passed by the caller have no effect.

    Args:
        data_path: Directory handed to ``ds.MnistDataset``.
        batch_size: Samples per batch; an incomplete final batch is dropped.
        repeat_size: Number of times the batched dataset is repeated.
        num_parallel_workers: Worker count passed to every ``map`` call.
        shard_id: Ignored (see note above).
        num_shards: Ignored (see note above).

    Returns:
        The mapped, shuffled, batched and repeated dataset for this shard.
    """
    # The shard layout always comes from the communication group.
    shard_id = get_rank()
    num_shards = get_group_size()
    pipeline = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id)

    # Image transforms, listed in the order they are applied:
    # resize to 32x32, scale by 1/255, then compute (x - 0.1307) / 0.3081,
    # and finally move the channel axis first.
    image_ops = [
        CV.Resize((32, 32), interpolation=Inter.LINEAR),  # Bilinear mode
        CV.Rescale(1.0 / 255.0, 0.0),
        CV.Rescale(1 / 0.3081, -1 * 0.1307 / 0.3081),
        CV.HWC2CHW(),
    ]

    # Labels are cast to int32 before any image transform is attached.
    pipeline = pipeline.map(operations=C.TypeCast(mstype.int32),
                            input_columns="label",
                            num_parallel_workers=num_parallel_workers)
    for image_op in image_ops:
        pipeline = pipeline.map(operations=image_op,
                                input_columns="image",
                                num_parallel_workers=num_parallel_workers)

    # Shuffle with a fixed 10000-sample buffer (as in the LeNet train script),
    # then batch and repeat.
    pipeline = pipeline.shuffle(buffer_size=10000)
    pipeline = pipeline.batch(batch_size, drop_remainder=True)
    return pipeline.repeat(repeat_size)

+ 0
- 202
npu/inference.py View File

@@ -1,202 +0,0 @@
"""
######################## single-dataset inference lenet example ########################
This example is a single-dataset inference tutorial.

######################## Instructions for using the inference environment ########################
The image of the debugging environment and the image of the inference environment are two different images,
and the working local directories are different. In the inference task, you need to pay attention to the following points.
1、(1)The structure of the dataset uploaded for single dataset inference in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte

(2)The dataset structure of the single dataset in the inference image in this example
workroot
├── data
| ├── test
| └── train

2、Inference task requires predefined functions
(1)Defines whether the task is an inference environment or a debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The inference task uses this parameter to represent the local path of the inference image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

(2)Copy single dataset from obs to inference image.
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return

(3)Copy ckpt file from obs to inference image.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
try:
mox.file.copy(obs_ckpt_url, ckpt_url)
print("Successfully Download {} to {}".format(obs_ckpt_url,
ckpt_url))
except Exception as e:
print('moxing download {} to {} failed: '.format(
obs_ckpt_url, ckpt_url) + str(e))
return

(4)Copy the output result to obs.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return

3、4 parameters need to be defined.
--data_url is the dataset you selected on the Qizhi platform
--ckpt_url is the weight file you choose on the Qizhi platform

--data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a single dataset,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform,
because they are predefined in the background, you only need to define them in your code.

4、How the dataset is used
Inference task uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method
of the dataset in the image.
For details, please refer to the following sample code.
"""

import os
import argparse
import moxing as mox
import mindspore.nn as nn
from mindspore import context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore import Tensor
import numpy as np
from glob import glob
from dataset import create_dataset
from config import mnist_cfg as cfg
from lenet import LeNet5

### Defines whether the task is an inference environment or a debugging environment ###
def WorkEnvironment(environment):
    """Return the local working root for the given task environment.

    The chosen mode and root are also printed to stdout.

    Args:
        environment: ``'train'`` or ``'debug'``.

    Returns:
        ``'/home/work/user-job-dir'`` for ``'train'`` and ``'/home/work'``
        for ``'debug'``.

    Raises:
        ValueError: If ``environment`` is neither ``'train'`` nor ``'debug'``.
    """
    if environment == 'train':
        workroot = '/home/work/user-job-dir'
    elif environment == 'debug':
        workroot = '/home/work'
    else:
        # An unknown mode used to fall through and crash on the print below
        # with UnboundLocalError; report the real problem instead.
        raise ValueError(
            "unknown environment {!r}; expected 'train' or 'debug'".format(environment))
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

### Copy single dataset from obs to inference image ###
def ObsToEnv(obs_data_url, data_dir):
    """Copy a dataset folder from OBS into the local image (best effort).

    The outcome is printed to stdout. A failed copy is reported and then
    ignored, so the caller continues either way.

    Args:
        obs_data_url: Source location passed to ``mox.file.copy_parallel``.
        data_dir: Local destination directory.
    """
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as err:
        failure = 'moxing download {} to {} failed: '.format(obs_data_url, data_dir)
        print(failure + str(err))
### Copy ckpt file from obs to inference image###
### To operate on folders, use mox.file.copy_parallel. If copying a file.
### Please use mox.file.copy to operate the file, this operation is to operate the file
def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
    """Copy a single checkpoint file from OBS into the local image (best effort).

    Uses ``mox.file.copy`` (single file) rather than ``copy_parallel``
    (folders). The outcome is printed to stdout; a failed copy is reported
    and then ignored.

    Args:
        obs_ckpt_url: Source file location passed to ``mox.file.copy``.
        ckpt_url: Local destination file path.
    """
    try:
        mox.file.copy(obs_ckpt_url, ckpt_url)
        print("Successfully Download {} to {}".format(obs_ckpt_url, ckpt_url))
    except Exception as err:
        failure = 'moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url)
        print(failure + str(err))
### Copy the output result to obs###
def EnvToObs(train_dir, obs_train_url):
    """Upload a local output folder to OBS (best effort).

    The outcome is printed to stdout. A failed upload is reported and then
    ignored.

    Args:
        train_dir: Local directory to upload.
        obs_train_url: Destination passed to ``mox.file.copy_parallel``.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
    except Exception as err:
        failure = 'moxing upload {} to {} failed: '.format(train_dir, obs_train_url)
        print(failure + str(err))

### --data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a inference task,
### otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform,
### because they are predefined in the background, you only need to define them in your code.
# Command-line options. Each default path is derived from the 'train' work
# root, so WorkEnvironment() runs (and prints) once per default while this
# module is loaded.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--data_url',
                    type=str,
                    default=WorkEnvironment('train') + '/data/',
                    help='path where the dataset is saved')
parser.add_argument('--ckpt_url',
                    help='model to save/load',
                    default=WorkEnvironment('train') + '/checkpoint.ckpt')
parser.add_argument('--result_url',
                    help='result folder to save/load',
                    default=WorkEnvironment('train') + '/result/')
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
                    help='device where the code will be implemented (default: Ascend)')

if __name__ == "__main__":
    # Options this script does not declare are tolerated, not rejected.
    args, unknown = parser.parse_known_args()

    # Local layout inside the inference image.
    environment = 'train'
    workroot = WorkEnvironment(environment)
    data_dir = workroot + '/data'
    result_dir = workroot + '/result'
    ckpt_url = workroot + '/checkpoint.ckpt'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    # Fetch the dataset folder and the checkpoint file from OBS.
    ObsToEnv(args.data_url, data_dir)
    ObsUrlToEnv(args.ckpt_url, ckpt_url)

    # Destination for the result upload at the end.
    obs_result_url = args.result_url

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")

    param_dict = load_checkpoint(ckpt_url)
    load_param_into_net(network, param_dict)

    # Run the network on the first batch (batch_size=1) of the test split.
    ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator()
    data = next(ds_test)
    labels = data["label"].asnumpy()
    print('Tensor:', Tensor(data['image']))
    output = model.predict(Tensor(data['image']))
    predicted = np.argmax(output.asnumpy(), axis=1)
    # Both labels are printed from the same argmax result.
    print('predicted:', predicted)
    print('pred:', predicted)

    print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')

    # Append the prediction to result.txt inside the result directory.
    filename = 'result.txt'
    file_path = os.path.join(result_dir, filename)
    with open(file_path, 'a+') as file:
        file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))

    # Copy the result directory back to OBS so it can be downloaded from the
    # corresponding inference task on the Qizhi platform.
    EnvToObs(result_dir, obs_result_url)

+ 0
- 60
npu/lenet.py View File

@@ -1,60 +0,0 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""LeNet."""
import mindspore.nn as nn
from mindspore.common.initializer import Normal


class LeNet5(nn.Cell):
    """
    LeNet-5 network: two conv/ReLU/max-pool stages followed by an optional
    three-layer dense head.

    Args:
        num_class (int): Number of classes. Default: 10.
        num_channel (int): Number of input channels. Default: 1.
        include_top (bool): When True, the flatten and dense layers are built
            and applied; when False, ``construct`` returns the output of the
            second pooling stage. Default: True.

    Returns:
        Tensor, output tensor

    Examples:
        >>> LeNet5(num_class=10)

    """

    def __init__(self, num_class=10, num_channel=1, include_top=True):
        super().__init__()
        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
        # One ReLU and one pooling cell are shared by every stage.
        self.relu = nn.ReLU()
        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
        self.include_top = include_top
        if include_top:
            self.flatten = nn.Flatten()
            # 16 * 5 * 5 is the flattened feature size for 32x32 inputs.
            self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
            self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
            self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))

    def construct(self, x):
        """Run the forward pass and return the logits or the pooled features."""
        x = self.max_pool2d(self.relu(self.conv1(x)))
        x = self.max_pool2d(self.relu(self.conv2(x)))
        if self.include_top:
            x = self.flatten(x)
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            x = self.fc3(x)
        return x

+ 0
- 193
npu/train.py View File

@@ -1,193 +0,0 @@
"""
######################## single-dataset train lenet example ########################
This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training
tutorial train_for_multidataset.py. This example cannot be used for multi-datasets!

######################## Instructions for using the training environment ########################
The image of the debugging environment and the image of the training environment are two different images,
and the working local directories are different. In the training task, you need to pay attention to the following points.
1、(1)The structure of the dataset uploaded for single dataset training in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte

(2)The dataset structure of the single dataset in the training image in this example
workroot
├── data
| ├── test
| └── train

2、Single dataset training requires predefined functions
(1)Defines whether the task is a training environment or a debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

(2)Copy single dataset from obs to training image.
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return

(3)Copy the output model to obs.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return

3、3 parameters need to be defined
--data_url is the dataset you selected on the Qizhi platform

--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform,
because they are predefined in the background, you only need to define them in your code.

4、How the dataset is used
A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method
of the dataset in the image.
For details, please refer to the following sample code.

"""

import os
import argparse
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed

### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
    """Return the local working root for the given task environment.

    The chosen mode and root are also printed to stdout.

    Args:
        environment: ``'train'`` or ``'debug'``.

    Returns:
        ``'/home/work/user-job-dir'`` for ``'train'`` and ``'/home/work'``
        for ``'debug'``.

    Raises:
        ValueError: If ``environment`` is neither ``'train'`` nor ``'debug'``.
    """
    if environment == 'train':
        workroot = '/home/work/user-job-dir'
    elif environment == 'debug':
        workroot = '/home/work'
    else:
        # An unknown mode used to fall through and crash on the print below
        # with UnboundLocalError; report the real problem instead.
        raise ValueError(
            "unknown environment {!r}; expected 'train' or 'debug'".format(environment))
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

### Copy single dataset from obs to training image###
def ObsToEnv(obs_data_url, data_dir):
    """Copy a dataset folder from OBS into the training image (best effort).

    The outcome is printed to stdout. A failed copy is reported and then
    ignored, so the caller continues either way.

    Args:
        obs_data_url: Source location passed to ``mox.file.copy_parallel``.
        data_dir: Local destination directory.
    """
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as err:
        failure = 'moxing download {} to {} failed: '.format(obs_data_url, data_dir)
        print(failure + str(err))
### Copy the output model to obs###
def EnvToObs(train_dir, obs_train_url):
    """Upload the local model folder to OBS (best effort).

    The outcome is printed to stdout. A failed upload is reported and then
    ignored.

    Args:
        train_dir: Local directory to upload.
        obs_train_url: Destination passed to ``mox.file.copy_parallel``.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
    except Exception as err:
        failure = 'moxing upload {} to {} failed: '.format(train_dir, obs_train_url)
        print(failure + str(err))

### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset,
### otherwise an error will be reported.
###There is no need to add these parameters to the running parameters of the Qizhi platform,
###because they are predefined in the background, you only need to define them in your code.
# Command-line options. Each default path is derived from the 'train' work
# root, so WorkEnvironment() runs (and prints) once per default while this
# module is loaded.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument(
    '--data_url',
    default=WorkEnvironment('train') + '/data/',
    help='path to training/inference dataset folder',
)
parser.add_argument(
    '--train_url',
    default=WorkEnvironment('train') + '/model/',
    help='model folder to save/load',
)
parser.add_argument(
    '--device_target',
    type=str,
    default="Ascend",
    choices=['Ascend', 'CPU'],
    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU',
)
parser.add_argument(
    '--epoch_size',
    type=int,
    default=5,
    help='Training epochs.',
)

if __name__ == "__main__":
    # Options this script does not declare are tolerated, not rejected.
    args, _ = parser.parse_known_args()

    # Local layout inside the training image.
    workroot = WorkEnvironment('train')
    data_dir = workroot + '/data'
    train_dir = workroot + '/model'
    for folder in (data_dir, train_dir):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Pull the dataset from OBS, then pick the device (CPU or Ascend NPU).
    ObsToEnv(args.data_url, data_dir)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size)
    steps_per_epoch = ds_train.get_dataset_size()
    if steps_per_epoch == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=steps_per_epoch)

    # amp_level="O2" is requested only when running on Ascend.
    model_options = {"metrics": {"accuracy": Accuracy()}}
    if args.device_target == "Ascend":
        model_options["amp_level"] = "O2"
    model = Model(network, net_loss, net_opt, **model_options)

    ckpoint_cb = ModelCheckpoint(
        prefix="checkpoint_lenet",
        directory=train_dir,
        config=CheckpointConfig(
            save_checkpoint_steps=cfg.save_checkpoint_steps,
            keep_checkpoint_max=cfg.keep_checkpoint_max),
    )

    print("============== Starting Training ==============")
    # A truthy --epoch_size wins; otherwise fall back to the config value.
    epoch_size = args.epoch_size or cfg['epoch_size']
    print('epoch_size is: ', epoch_size)

    model.train(epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb, LossMonitor()])

    # Copy the trained model back to OBS so it can be downloaded from the
    # corresponding training task on the Qizhi platform.
    EnvToObs(train_dir, args.train_url)


+ 0
- 205
npu/train_dataparallel.py View File

@@ -1,205 +0,0 @@
"""
######################## single-dataset train lenet example ########################
This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training
tutorial train_for_multidataset.py. This example cannot be used for multi-datasets!
######################## Instructions for using the training environment ########################
The image of the debugging environment and the image of the training environment are two different images,
and the working local directories are different. In the training task, you need to pay attention to the following points.
1、(1)The structure of the dataset uploaded for single dataset training in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
(2)The dataset structure of the single dataset in the training image in this example
workroot
├── data
| ├── test
| └── train
2、Single dataset training requires predefined functions
(1)Defines whether the task is a training environment or a debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot
(2)Copy single dataset from obs to training image.
def ObsToEnv(obs_data_url, data_dir):
try:
mox.file.copy_parallel(obs_data_url, data_dir)
print("Successfully Download {} to {}".format(obs_data_url, data_dir))
except Exception as e:
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
return
(3)Copy the output model to obs.
def EnvToObs(train_dir, obs_train_url):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
return
3、3 parameters need to be defined
--data_url is the dataset you selected on the Qizhi platform
--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform,
because they are predefined in the background, you only need to define them in your code.
4、How the dataset is used
A single dataset uses data_url as the input, and data_dir (ie: workroot + '/data') as the calling method
of the dataset in the image.
For details, please refer to the following sample code.
"""
import os
import argparse
from dataset_distributed import create_dataset_parallel
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.common import set_seed
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank, get_group_size
import mindspore.ops as ops
# Import-time device setup for distributed training.
# DEVICE_ID must be set in the environment: os.getenv() returns None when it
# is missing, and int(None) then raises TypeError.
device_id = int(os.getenv('DEVICE_ID'))
# The target is fixed to "Ascend" here, before the command line is parsed, so
# --device_target does not influence this context.
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)
# Initialise the communication backend after the context has been configured.
init()
### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
    """Return the local working root for the given task environment.

    The chosen mode and root are also printed to stdout.

    Args:
        environment: ``'train'`` or ``'debug'``.

    Returns:
        ``'/home/work/user-job-dir'`` for ``'train'`` and ``'/home/work'``
        for ``'debug'``.

    Raises:
        ValueError: If ``environment`` is neither ``'train'`` nor ``'debug'``.
    """
    if environment == 'train':
        workroot = '/home/work/user-job-dir'
    elif environment == 'debug':
        workroot = '/home/work'
    else:
        # An unknown mode used to fall through and crash on the print below
        # with UnboundLocalError; report the real problem instead.
        raise ValueError(
            "unknown environment {!r}; expected 'train' or 'debug'".format(environment))
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot
### Copy single dataset from obs to training image###
def ObsToEnv(obs_data_url, data_dir):
    """Copy a dataset folder from OBS into the training image (best effort).

    The outcome is printed to stdout. A failed copy is reported and then
    ignored, so the caller continues either way.

    Args:
        obs_data_url: Source location passed to ``mox.file.copy_parallel``.
        data_dir: Local destination directory.
    """
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as err:
        failure = 'moxing download {} to {} failed: '.format(obs_data_url, data_dir)
        print(failure + str(err))
### Copy the output model to obs###
def EnvToObs(train_dir, obs_train_url):
    """Upload the local model folder to OBS (best effort).

    The outcome is printed to stdout. A failed upload is reported and then
    ignored.

    Args:
        train_dir: Local directory to upload.
        obs_train_url: Destination passed to ``mox.file.copy_parallel``.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir, obs_train_url))
    except Exception as err:
        failure = 'moxing upload {} to {} failed: '.format(train_dir, obs_train_url)
        print(failure + str(err))
### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset,
### otherwise an error will be reported.
###There is no need to add these parameters to the running parameters of the Qizhi platform,
###because they are predefined in the background, you only need to define them in your code.
# Command-line options. Each default path is derived from the 'train' work
# root, so WorkEnvironment() runs (and prints) once per default while this
# module is loaded.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument(
    '--data_url',
    default=WorkEnvironment('train') + '/data/',
    help='path to training/inference dataset folder',
)
parser.add_argument(
    '--train_url',
    default=WorkEnvironment('train') + '/model/',
    help='model folder to save/load',
)
parser.add_argument(
    '--device_target',
    type=str,
    default="Ascend",
    choices=['Ascend', 'CPU'],
    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU',
)
parser.add_argument(
    '--epoch_size',
    type=int,
    default=5,
    help='Training epochs.',
)

# Fixed seed, set at import time.
set_seed(114514)

if __name__ == "__main__":
    # Unlike the single-card scripts, this one uses the strict parse_args().
    args = parser.parse_args()

    # Local layout inside the training image.
    workroot = WorkEnvironment('train')
    data_dir = workroot + '/data'
    train_dir = workroot + '/model'
    for folder in (data_dir, train_dir):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Pull the dataset from OBS, then switch to data-parallel mode.
    ObsToEnv(args.data_url, data_dir)
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      gradients_mean=True)

    ds_train = create_dataset_parallel(os.path.join(data_dir, "train"),
                                       cfg.batch_size)
    steps_per_epoch = ds_train.get_dataset_size()
    if steps_per_epoch == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=steps_per_epoch)

    # amp_level="O2" is requested only when --device_target is Ascend.
    model_options = {"metrics": {"accuracy": Accuracy()}}
    if args.device_target == "Ascend":
        model_options["amp_level"] = "O2"
    model = Model(network, net_loss, net_opt, **model_options)

    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    # Every card saves its own checkpoints, so each rank writes into a
    # sub-directory named after get_rank().
    rank_dir = train_dir + "/" + str(get_rank()) + "/"
    ckpoint_cb = ModelCheckpoint(prefix="data_parallel",
                                 directory=rank_dir,
                                 config=config_ck)

    print("============== Starting Training ==============")
    # A truthy --epoch_size wins; otherwise fall back to the config value.
    epoch_size = args.epoch_size or cfg['epoch_size']
    print('epoch_size is: ', epoch_size)

    model.train(epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb, LossMonitor()],
                dataset_sink_mode=True)

    # Copy the trained model back to OBS so it can be downloaded from the
    # corresponding training task on the Qizhi platform.
    EnvToObs(train_dir, args.train_url)

+ 0
- 92
npu/train_for_c2net.py View File

@@ -1,92 +0,0 @@
"""
######################## train lenet example ########################
train lenet and get network model files(.ckpt)

The training of the intelligent computing network currently supports single dataset training, and does not require
the obs copy process.It only needs to define two parameters and then call it directly:
train_dir = '/cache/output' #The location of the output
data_dir = '/cache/dataset' #The location of the dataset
"""
#!/usr/bin/python
#coding=utf-8

import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed

# Command-line interface of the intelligent-computing-network LeNet example.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

# Device that runs the training; only the two listed choices are accepted.
parser.add_argument(
    '--device_target',
    default="Ascend",
    type=str,
    choices=['Ascend', 'CPU'],
    help=('device where the code will be implemented (default: Ascend),'
          'if to use the CPU on the Qizhi platform:device_target=CPU'),
)

# Number of training epochs requested on the command line.
parser.add_argument(
    '--epoch_size',
    default=5,
    type=int,
    help='Training epochs.',
)

# Seed MindSpore through set_seed at import time, before the dataset and the
# network are created below.
set_seed(1)

if __name__ == "__main__":
    args, unknown = parser.parse_known_args()
    print('args:')
    print(args)

    # Fixed output and dataset locations; this script performs no obs copy.
    train_dir = '/cache/output'
    data_dir = '/cache/dataset'

    # Select graph mode and the device (CPU or Ascend NPU) used for training.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

    # amp_level="O2" is requested only when the target device is Ascend.
    model_kwargs = {"metrics": {"accuracy": Accuracy()}}
    if args.device_target == "Ascend":
        model_kwargs["amp_level"] = "O2"
    model = Model(network, net_loss, net_opt, **model_kwargs)

    # Checkpoints are written into train_dir with the "checkpoint_lenet" prefix.
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max,
    )
    ckpoint_cb = ModelCheckpoint(
        prefix="checkpoint_lenet",
        directory=train_dir,
        config=config_ck,
    )

    print("============== Starting Training ==============")
    # A truthy --epoch_size replaces the epoch count taken from the config.
    epoch_size = cfg['epoch_size']
    if args.epoch_size:
        epoch_size = args.epoch_size
    print('epoch_size is: ', epoch_size)

    model.train(epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb, LossMonitor()])

    print("============== Finish Training ==============")

+ 0
- 237
npu/train_for_multidataset.py View File

@@ -1,237 +0,0 @@
"""
######################## multi-dataset train lenet example ########################
This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset
training tutorial train.py. This example cannot be used for a single dataset!
"""
"""
######################## Instructions for using the training environment ########################
1、(1)The structure of the dataset uploaded for multi-dataset training in this example
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
checkpoint_lenet-1_1875.zip
├── checkpoint_lenet-1_1875.ckpt

(2)The dataset structure in the training image for multiple datasets in this example
workroot
├── MNISTData
| ├── test
| └── train
└── checkpoint_lenet-1_1875
├── checkpoint_lenet-1_1875.ckpt

2、Multi-dataset training requires predefined functions
(1)Defines whether the task is a training environment or a debugging environment.
def WorkEnvironment(environment):
if environment == 'train':
workroot = '/home/work/user-job-dir' #The training task uses this parameter to represent the local path of the training image
elif environment == 'debug':
workroot = '/home/ma-user/work' #The debug task uses this parameter to represent the local path of the debug image
print('current work mode:' + environment + ', workroot:' + workroot)
return workroot

(2)Copy multiple datasets from obs to training image
def MultiObsToEnv(multi_data_url, workroot):
multi_data_json = json.loads(multi_data_url) #Parse multi_data_url
for i in range(len(multi_data_json)):
path = workroot + "/" + multi_data_json[i]["dataset_name"]
if not os.path.exists(path):
os.makedirs(path)
try:
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],
path))
except Exception as e:
print('moxing download {} to {} failed: '.format(
multi_data_json[i]["dataset_url"], path) + str(e))
return

***The input and output of the MultiObsToEnv function in this example:
Input for multi_data_url:
[
{
"dataset_url": "s3://test-opendata/attachment/e/a/eae3a316-42d6-4a43-a484-1fa573eab388e
ae3a316-42d6-4a43-a484-1fa573eab388/", #obs path of the dataset
"dataset_name": "MNISTData" #the name of the dataset
},
{
"dataset_url": "s3://test-opendata/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf82c
59be66-64ec-41ca-b311-f51a486eabf8/",
"dataset_name": "checkpoint_lenet-1_1875"
}
]
Purpose of MultiObsToEnv:
MultiObsToEnv copies every dataset listed in multi_data_url from obs to the training image
and places each one under workroot + "/" + dataset_name.
For example, the path of the MNISTData dataset in this example is /home/work/user-job-dir/MNISTData,
and the path of the checkpoint_lenet-1_1875 dataset is /home/work/user-job-dir/checkpoint_lenet-1_1875

(3)Copy the output model to obs.
def EnvToObs(obs_train_url, train_dir):
try:
mox.file.copy_parallel(train_dir, obs_train_url)
print("Successfully Upload {} to {}".format(train_dir,
obs_train_url))
except Exception as e:
print('moxing upload {} to {} failed: '.format(train_dir,
obs_train_url) + str(e))
return

3、Four parameters need to be defined
--data_url is the first dataset you selected on the Qizhi platform
--multi_data_url is the multi-dataset you selected on the Qizhi platform

The four parameters --data_url, --multi_data_url, --train_url and --device_target must be defined first in a
multi-dataset task, otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform:
they are predefined in the background, so you only need to define them in your code.

4、How the dataset is used
Multi-dataset tasks take multi_data_url as input. The path of a file or folder inside the training image is
workroot + "/" + dataset name + "/" + file or folder name in the dataset.
For example, the path of the train folder in the MNISTData dataset in this example is
workroot + "/MNISTData" + "/train"

For details, please refer to the following sample code.
"""

import os
import argparse

import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import json
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.common import set_seed
from mindspore import load_checkpoint, load_param_into_net

### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
    """Return the local working root for the given task type and print it.

    Args:
        environment: ``'train'`` for a training task or ``'debug'`` for a
            debugging task.

    Returns:
        The local root directory used by that kind of task, as a string.

    Raises:
        ValueError: If ``environment`` is neither ``'train'`` nor ``'debug'``.
    """
    if environment == 'train':
        workroot = '/home/work/user-job-dir'
    elif environment == 'debug':
        workroot = '/home/ma-user/work'
    else:
        # Without this branch ``workroot`` would be unbound below and the
        # caller would only see an opaque UnboundLocalError.
        raise ValueError(
            "unknown environment {!r}; expected 'train' or 'debug'".format(
                environment))
    print('current work mode:' + environment + ', workroot:' + workroot)
    return workroot

### Copy multiple datasets from obs to training image ###
def MultiObsToEnv(multi_data_url, workroot):
    """Copy every dataset listed in ``multi_data_url`` from obs into ``workroot``.

    Args:
        multi_data_url: JSON text encoding a list of objects, each holding a
            ``"dataset_url"`` (copy source) and a ``"dataset_name"``.
        workroot: Local root directory; each dataset is copied to
            ``workroot + "/" + dataset_name``, which is created if missing.

    Returns:
        None. A copy that raises is reported on stdout and the remaining
        datasets are still processed.
    """
    for entry in json.loads(multi_data_url):
        path = workroot + "/" + entry["dataset_name"]
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)
        dataset_url = entry["dataset_url"]
        try:
            mox.file.copy_parallel(dataset_url, path)
            print("Successfully Download {} to {}".format(dataset_url, path))
        except Exception as e:
            # Deliberately best-effort: log the failure and go on to the next
            # dataset instead of aborting the whole task.
            print('moxing download {} to {} failed: '.format(
                dataset_url, path) + str(e))
    return
### Copy the output model to obs ###
def EnvToObs(obs_train_url, train_dir):
    """Copy ``train_dir`` to ``obs_train_url`` with moxing and report the result.

    Note the parameter order: the copy destination comes first and the copy
    source second.

    Args:
        obs_train_url: Destination handed to ``mox.file.copy_parallel``.
        train_dir: Source handed to ``mox.file.copy_parallel``.

    Returns:
        None. Any exception raised while copying is printed, not propagated.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print(f"Successfully Upload {train_dir} to {obs_train_url}")
    except Exception as err:
        print(f'moxing upload {train_dir} to {obs_train_url} failed: {err}')
    return


# Command-line interface of the multi-dataset LeNet example.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

### --data_url, --multi_data_url, --train_url and --device_target must all be
### defined first in a multi-dataset task, otherwise an error will be reported.
### They do not have to be added to the running parameters of the Qizhi
### platform: they are predefined in the background, so defining them in the
### code is enough.
parser.add_argument(
    '--data_url',
    default=WorkEnvironment('train') + '/data/',
    help='path to training/inference dataset folder',
)

# NOTE(review): this default is a directory path, while MultiObsToEnv passes
# the value to json.loads - confirm the platform always supplies this option.
parser.add_argument(
    '--multi_data_url',
    default=WorkEnvironment('train'),
    help='path to multi dataset',
)

parser.add_argument(
    '--train_url',
    default=WorkEnvironment('train') + '/model/',
    help='model folder to save/load',
)

# Device that runs the training; only the two listed choices are accepted.
parser.add_argument(
    '--device_target',
    default="Ascend",
    type=str,
    choices=['Ascend', 'CPU'],
    help=('device where the code will be implemented (default: Ascend),'
          'if to use the CPU on the Qizhi platform:device_target=CPU'),
)

# Number of training epochs requested on the command line.
parser.add_argument(
    '--epoch_size',
    default=5,
    type=int,
    help='Training epochs.',
)

if __name__ == "__main__":
    args, unknown = parser.parse_known_args()

    # Resolve the local working root first, then let MultiObsToEnv copy every
    # dataset listed in --multi_data_url from obs into it.
    environment = 'train'
    workroot = WorkEnvironment(environment)
    MultiObsToEnv(args.multi_data_url, workroot)

    ### Define the output path in the training image
    train_dir = workroot + '/model'
    os.makedirs(train_dir, exist_ok=True)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    # The dataset path used here is workroot + "/MNISTData" + "/train".
    ds_train = create_dataset(os.path.join(workroot + "/MNISTData", "train"),
                              cfg.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

    ### Load the trained model from
    ### workroot + "/checkpoint_lenet-1_1875" + "/checkpoint_lenet-1_1875.ckpt"
    load_param_into_net(
        network,
        load_checkpoint(os.path.join(workroot + "/checkpoint_lenet-1_1875",
                                     "checkpoint_lenet-1_1875.ckpt")))

    # amp_level="O2" is requested only when the target device is Ascend.
    if args.device_target != "Ascend":
        model = Model(network, net_loss, net_opt,
                      metrics={"accuracy": Accuracy()})
    else:
        model = Model(network, net_loss, net_opt,
                      metrics={"accuracy": Accuracy()},
                      amp_level="O2")

    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=train_dir,
                                 config=config_ck)

    print("============== Starting Training ==============")
    # A truthy --epoch_size replaces the epoch count taken from the config.
    epoch_size = cfg['epoch_size']
    if args.epoch_size:
        epoch_size = args.epoch_size
    print('epoch_size is: ', epoch_size)

    model.train(epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb, LossMonitor()])

    ### Copy the trained model data from the local running environment back to
    ### obs, so it can be downloaded from the corresponding training task on
    ### the Qizhi platform.
    # EnvToObs is declared as (obs_train_url, train_dir). The previous
    # positional call passed the two values swapped, which made the copy run
    # from obs to the local directory; keywords keep the direction explicit.
    EnvToObs(obs_train_url=args.train_url, train_dir=train_dir)

Loading…
Cancel
Save