diff --git a/config.py b/config.py deleted file mode 100755 index e191906..0000000 --- a/config.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -network config setting, will be used in train.py -""" - -from easydict import EasyDict as edict - -mnist_cfg = edict({ - 'num_classes': 10, - 'lr': 0.01, - 'momentum': 0.9, - 'epoch_size': 10, - 'batch_size': 32, - 'buffer_size': 1000, - 'image_height': 32, - 'image_width': 32, - 'save_checkpoint_steps': 1875, - 'keep_checkpoint_max': 10, - 'air_name': "lenet", -}) diff --git a/convert_pytorch.py b/convert_pytorch.py deleted file mode 100755 index 0aeb4ff..0000000 --- a/convert_pytorch.py +++ /dev/null @@ -1,59 +0,0 @@ -import torchvision -import torch -import argparse -from torch.autograd import Variable -import onnx -print(torch.__version__) - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -parser.add_argument('--model', - type=str, - help='path to training/inference dataset folder' - ) -parser.add_argument('--n', - type=int, - default=256, - help='batch size for input shape type' - ) -parser.add_argument('--c', - type=int, - default=1, - help='channel for input shape type' - ) -parser.add_argument('--h', - type=int, - default=28, - help='height for input shape type' - ) -parser.add_argument('--w', - type=int, - default=28, - help='width for input shape type' - ) - -if __name__ == "__main__": - args = parser.parse_args() - print('args:') - print(args) - - model_file = '/dataset/' + args.model - print(model_file) - model = torch.load(model_file) - print(model) - print(type(model)) - for k, v in model.named_parameters(): - print("k:",k) - print("v:",v.shape) - - suffix = args.model.rindex(".") - out_file = '/model/' + args.model + ".onnx" - if suffix!=-1 : - out_file = '/model/' + args.model[0:suffix] + ".onnx" - print(out_file) - input_name = ['input'] - output_name = ['output'] - input = Variable(torch.randn(args.n, args.c, args.h, args.w)) - torch.onnx.export(model, input, out_file, input_names=input_name, output_names=output_name, verbose=True) - - diff --git a/convert_to_onnx.py b/convert_to_onnx.py deleted file mode 100755 index 4390052..0000000 --- a/convert_to_onnx.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np -from mindspore import Tensor, export, load_checkpoint -from mindvision.classification.models import resnet50 -from mindvision.dataset import DownLoad - -# 下载Resnet50的预训练模型 -dl = DownLoad() -dl.download_url('https://download.mindspore.cn/vision/classification/resnet50_224.ckpt') - -resnet = resnet50(1000) -load_checkpoint("resnet50_224.ckpt", net=resnet) - -input_np = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32) - -# 导出文件resnet50_224.mindir到当前文件夹 -export(resnet, Tensor(input_np), file_name='resnet50_224', file_format='MINDIR') - - - -# 保存resnet50_224.onnx文件到当前目录下 -export(resnet, Tensor(input_np), file_name='resnet50_224', file_format='ONNX') - diff --git a/dataset.py b/dataset.py deleted file mode 100755 index df9eecd..0000000 --- a/dataset.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Produce the dataset -""" - -import mindspore.dataset as ds -import mindspore.dataset.vision.c_transforms as CV -import mindspore.dataset.transforms.c_transforms as C -from mindspore.dataset.vision import Inter -from mindspore.common import dtype as mstype - - -def create_dataset(data_path, batch_size=32, repeat_size=1, - num_parallel_workers=1): - """ - create dataset for train or test - """ - # define dataset - mnist_ds = ds.MnistDataset(data_path) - - resize_height, resize_width = 32, 32 - rescale = 1.0 / 255.0 - shift = 0.0 - rescale_nml = 1 / 0.3081 - shift_nml = -1 * 0.1307 / 0.3081 - - # define map operations - resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode - rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) - rescale_op = CV.Rescale(rescale, shift) - hwc2chw_op = CV.HWC2CHW() - type_cast_op = C.TypeCast(mstype.int32) - - # apply map operations on images - mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) - mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) - - # apply DatasetOps - buffer_size = 10000 - mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script - mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) - mnist_ds = mnist_ds.repeat(repeat_size) - - return mnist_ds diff --git a/gpu_train.py b/gpu_train.py deleted file mode 100755 index 9a80582..0000000 --- a/gpu_train.py +++ /dev/null @@ -1,74 +0,0 @@ -''' -由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码, -本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并 -提交镜像,再切到训练环境训练已跑通的代码。 -在训练环境中,上传的数据集会自动放在/dataset目录下,模型下载路径默认在/model下,请将模型输出位置指定到/model, -启智平台界面会提供/model目录下的文件下载。 -''' - - -from model import Model -import numpy as np -import torch -from torchvision.datasets import mnist -from torch.nn import CrossEntropyLoss -from torch.optim import SGD -from torch.utils.data import DataLoader -from torchvision.transforms import ToTensor -import argparse - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -#数据集位置放在/dataset下 -parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') -parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') -parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') -parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') - -if __name__ == '__main__': - args = parser.parse_args() - #日志输出 - print('cuda is available:{}'.format(torch.cuda.is_available())) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - batch_size = args.batch_size - train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) - test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) - train_loader = DataLoader(train_dataset, batch_size=batch_size) - test_loader = DataLoader(test_dataset, batch_size=batch_size) - model = Model().to(device) - sgd = SGD(model.parameters(), lr=1e-1) - cost = CrossEntropyLoss() - epoch = args.epoch_size - #日志输出 - print('epoch_size is:{}'.format(epoch)) - for _epoch in range(epoch): - print('the {} epoch_size begin'.format(_epoch + 1)) - model.train() - for idx, (train_x, train_label) in enumerate(train_loader): - train_x = train_x.to(device) - train_label = train_label.to(device) - label_np = np.zeros((train_label.shape[0], 10)) - sgd.zero_grad() - predict_y = model(train_x.float()) - loss = cost(predict_y, train_label.long()) - if idx % 10 == 0: - print('idx: {}, loss: {}'.format(idx, loss.sum().item())) - loss.backward() - sgd.step() - - correct = 0 - _sum = 0 - model.eval() - for idx, (test_x, test_label) in enumerate(test_loader): - test_x = test_x - test_label = test_label - predict_y = model(test_x.to(device).float()).detach() - predict_ys = np.argmax(predict_y.cpu(), axis=-1) - label_np = test_label.numpy() - _ = predict_ys == test_label - correct += np.sum(_.numpy(), axis=-1) - _sum += _.shape[0] - #日志输出 - print('accuracy: {:.2f}'.format(correct / _sum)) - #模型输出位置放在/model下 - torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) diff --git a/gpu_train_resnet50.py b/gpu_train_resnet50.py deleted file mode 100755 index 9bf6157..0000000 --- a/gpu_train_resnet50.py +++ /dev/null @@ -1,30 +0,0 @@ -''' -由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码, -本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并 -提交镜像,再切到训练环境训练已跑通的代码。 -在训练环境中,上传的数据集会自动放在/dataset目录下,模型下载路径默认在/model下,请将模型输出位置指定到/model, -启智平台界面会提供/model目录下的文件下载。 -''' - -import torchvision -from torch.autograd import Variable -import torch -import argparse - -# Training settings -parser = argparse.ArgumentParser(description='Resnet50 Example') -#数据集位置放在/dataset下 -parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') -parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') -parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') -parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') - -if __name__ == '__main__': - input_name = ['input'] - output_name = ['output'] - input = Variable(torch.randn(1, 3, 224, 224)).cuda() - model = torchvision.models.resnet50(pretrained=True).cuda() - - #模型输出位置放在/model下 - torch.save(model, '/model/resnet50.pth') - diff --git a/grampus_tf_train.py b/grampus_tf_train.py deleted file mode 100755 index 86bb261..0000000 --- a/grampus_tf_train.py +++ /dev/null @@ -1,154 +0,0 @@ -# coding: utf-8 -import tensorflow as tf -from tensorflow.examples.tutorials.mnist import input_data -import os -import argparse -import moxing as mox - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -workroot = '/cache/' -#初始化过滤器 -def weight_variable(shape): - return tf.Variable(tf.truncated_normal(shape, stddev=0.1)) - -#初始化偏置,初始化时,所有值是0.1 -def bias_variable(shape): - return tf.Variable(tf.constant(0.1, shape=shape)) - -#卷积运算,strides表示每一维度滑动的步长,一般strides[0]=strides[3]=1 -#第四个参数可选"Same"或"VALID",“Same”表示边距使用全0填充 -def conv2d(x, W): - return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") - - -#池化运算 -def max_pool_2x2(x): - - return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") - -def parse_args(): - parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -# define 2 parameters for running on modelArts -# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 - parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= workroot + '/dataset/') - - parser.add_argument('--train_url', - help='model folder to save/load', - default= workroot + '/output/') - parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') - -#modelarts已经默认使用data_url和train_url - parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - - args = parser.parse_args() - return args - - -if __name__ == "__main__": - args = parse_args() - print('args:') - print(args) - - mnist = input_data.read_data_sets('mnist_data', one_hot=True) - - #创建x占位符,用于临时存放MNIST图片的数据, - # [None, 784]中的None表示不限长度,而784则是一张图片的大小(28×28=784) - x = tf.placeholder(tf.float32, [None, 784], name='input') - #y_存的是实际图像的标签,即对应于每张输入图片实际的值 - y_ = tf.placeholder(tf.float32, [None, 10]) - - #将图片从784维向量重新还原为28×28的矩阵图片, - # 原因参考卷积神经网络模型图,最后一个参数代表深度, - # 因为MNIST是黑白图片,所以深度为1, - # 第一个参数为-1,表示一维的长度不限定,这样就可以灵活设置每个batch的训练的个数了 - x_image = tf.reshape(x, [-1, 28, 28, 1]) - - #第一层卷积 - #将过滤器设置成5×5×1的矩阵, - #其中5×5表示过滤器大小,1表示深度,因为MNIST是黑白图片只有一层。所以深度为1 - #32表示我们要创建32个大小5×5×1的过滤器,经过卷积后算出32个特征图(每个过滤器得到一个特征图),即输出深度为64 - W_conv1 = weight_variable([5, 5, 1, 32]) - #有多少个特征图就有多少个偏置 - b_conv1 = bias_variable([32]) - #使用conv2d函数进行卷积计算,然后再用ReLU作为激活函数 - h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) - #卷积以后再经过池化操作 - h_pool1 = max_pool_2x2(h_conv1) - - #第二层卷积 - #因为经过第一层卷积运算后,输出的深度为32,所以过滤器深度和下一层输出深度也做出改变 - W_conv2 = weight_variable([5, 5, 32, 64]) - b_conv2 = bias_variable([64]) - h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) - h_pool2 = max_pool_2x2(h_conv2) - - #全连接层 - #经过两层卷积后,图片的大小为7×7(第一层池化后输出为(28/2)×(28/2), - #第二层池化后输出为(14/2)×(14/2)),深度为64, - #我们在这里加入一个有1024个神经元的全连接层,所以权重W的尺寸为[7 * 7 * 64, 1024] - W_fc1 = weight_variable([7 * 7 * 64, 1024]) - #偏置的个数和权重的个数一致 - b_fc1 = bias_variable([1024]) - #这里将第二层池化后的张量(长:7 宽:7 深度:64) 变成向量(跟上一节的Softmax模型的输入一样了) - h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) - #使用ReLU激活函数 - h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) - - #dropout - #为了减少过拟合,我们在输出层之前加入dropout - keep_prob = tf.placeholder(tf.float32, name='keep_prob') - h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) - - #输出层 - #全连接层输入的大小为1024,而我们要得到的结果的大小是10(0~9), - # 所以这里权重W的尺寸为[1024, 10] - W_fc2 = weight_variable([1024, 10]) - b_fc2 = bias_variable([10]) - - #最后都要经过Softmax函数将输出转化为概率问题 - y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output') - - #损失函数和损失优化 - cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv))) - train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) - - #测试准确率,跟Softmax回归模型的一样 - correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) - - train_dir = '/cache/output/' #模型存放路径 - if not os.path.exists(train_dir): - os.mkdir(train_dir) - obs_train_url = args.train_url - #开始训练 - with tf.Session() as sess: - #初始化所有变量 - sess.run(tf.global_variables_initializer()) - #训练两万次 - for i in range(2000): - #每次获取50张图片数据和对应的标签 - batch = mnist.train.next_batch(50) - #每训练100次,我们打印一次训练的准确率 - if i % 100 == 0: - train_accuracy =sess.run(accuracy, feed_dict={x:batch[0], y_:batch[1], keep_prob:1.0}) - print("step %d, training accuracy %g" % (i, train_accuracy)) - #这里是真的训练,将数据传入 - sess.run(train_step, feed_dict={x:batch[0], y_:batch[1], keep_prob:0.5}) - - # 用SavedModel的方式保存 - tf.compat.v1.saved_model.simple_save(sess, - train_dir +"saved_model", - inputs={"input": x, 'keep_prob':keep_prob}, - outputs={"output": y_conv}) - diff --git a/grampus_train.py b/grampus_train.py deleted file mode 100755 index 81116b7..0000000 --- a/grampus_train.py +++ /dev/null @@ -1,74 +0,0 @@ -''' -由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码, -本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并 -提交镜像,再切到训练环境训练已跑通的代码。 -在训练环境中,上传的数据集会自动放在/dataset目录下,模型下载路径默认在/model下,请将模型输出位置指定到/model, -启智平台界面会提供/model目录下的文件下载。 -''' - - -from model import Model -import numpy as np -import torch -from torchvision.datasets import mnist -from torch.nn import CrossEntropyLoss -from torch.optim import SGD -from torch.utils.data import DataLoader -from torchvision.transforms import ToTensor -import argparse - -# Training settings -parser = argparse.ArgumentParser(description='PyTorch MNIST Example') -#数据集位置放在/dataset下 -parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset') -parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset') -parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') -parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') - -if __name__ == '__main__': - args = parser.parse_args() - #日志输出 - print('cuda is available:{}'.format(torch.cuda.is_available())) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - batch_size = args.batch_size - train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) - test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) - train_loader = DataLoader(train_dataset, batch_size=batch_size) - test_loader = DataLoader(test_dataset, batch_size=batch_size) - model = Model().to(device) - sgd = SGD(model.parameters(), lr=1e-1) - cost = CrossEntropyLoss() - epoch = args.epoch_size - #日志输出 - print('epoch_size is:{}'.format(epoch)) - for _epoch in range(epoch): - print('the {} epoch_size begin'.format(_epoch + 1)) - model.train() - for idx, (train_x, train_label) in enumerate(train_loader): - train_x = train_x.to(device) - train_label = train_label.to(device) - label_np = np.zeros((train_label.shape[0], 10)) - sgd.zero_grad() - predict_y = model(train_x.float()) - loss = cost(predict_y, train_label.long()) - if idx % 10 == 0: - print('idx: {}, loss: {}'.format(idx, loss.sum().item())) - loss.backward() - sgd.step() - - correct = 0 - _sum = 0 - model.eval() - for idx, (test_x, test_label) in enumerate(test_loader): - test_x = test_x - test_label = test_label - predict_y = model(test_x.to(device).float()).detach() - predict_ys = np.argmax(predict_y.cpu(), axis=-1) - label_np = test_label.numpy() - _ = predict_ys == test_label - correct += np.sum(_.numpy(), axis=-1) - _sum += _.shape[0] - #日志输出 - print('accuracy: {:.2f}'.format(correct / _sum)) - #模型输出位置放在/model下 - torch.save(model, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) \ No newline at end of file diff --git a/inference.py b/inference.py deleted file mode 100755 index 3fb5271..0000000 --- a/inference.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -######################## inference lenet example ######################## -inference lenet according to model file -""" - -""" -######################## 推理环境使用说明 ######################## -1、在推理环境中,需要将数据集从obs拷贝到推理镜像中,推理完以后,需要将输出的结果拷贝到obs. -(1)将数据集从obs拷贝到推理镜像中: - obs_data_url = args.data_url - args.data_url = '/home/work/user-job-dir/data/' - if not os.path.exists(args.data_url): - os.mkdir(args.data_url) - try: - mox.file.copy_parallel(obs_data_url, args.data_url) - print("Successfully Download {} to {}".format(obs_data_url, - args.data_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_data_url, args.data_url) + str(e)) - -(2)将模型文件从obs拷贝到推理镜像中: - obs_ckpt_url = args.ckpt_url - args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt' - try: - mox.file.copy(obs_ckpt_url, args.ckpt_url) - print("Successfully Download {} to {}".format(obs_ckpt_url, - args.ckpt_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_ckpt_url, args.ckpt_url) + str(e)) - -(3)将输出的结果拷贝回obs: - obs_result_url = args.result_url - args.result_url = '/home/work/user-job-dir/result/' - if not os.path.exists(args.result_url): - os.mkdir(args.result_url) - try: - mox.file.copy_parallel(args.result_url, obs_result_url) - print("Successfully Upload {} to {}".format(args.result_url, obs_result_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e)) -详细代码可参考以下示例代码: -""" - -import os -import argparse -import moxing as mox -import mindspore.nn as nn -from mindspore import context -from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore import Tensor -import numpy as np -from glob import glob -from dataset import create_dataset -from config import mnist_cfg as cfg -from lenet import LeNet5 - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], - help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_url', - type=str, - default="./Data", - help='path where the dataset is saved') - parser.add_argument('--ckpt_url', - help='model to save/load', - default='./ckpt_url') - parser.add_argument('--result_url', - help='result folder to save/load', - default='./result') - - args = parser.parse_args() - - #将数据集从obs拷贝到推理镜像中: - obs_data_url = args.data_url - args.data_url = '/home/work/user-job-dir/data/' - if not os.path.exists(args.data_url): - os.mkdir(args.data_url) - try: - mox.file.copy_parallel(obs_data_url, args.data_url) - print("Successfully Download {} to {}".format(obs_data_url, - args.data_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_data_url, args.data_url) + str(e)) - - #对文件夹进行操作,请使用mox.file.copy_parallel。如果拷贝一个文件。请使用mox.file.copy对文件操作,本次操作是对文件进行操作 - #将模型文件从obs拷贝到推理镜像中: - obs_ckpt_url = args.ckpt_url - args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt' - try: - mox.file.copy(obs_ckpt_url, args.ckpt_url) - print("Successfully Download {} to {}".format(obs_ckpt_url, - args.ckpt_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_ckpt_url, args.ckpt_url) + str(e)) - - #设置输出路径result_url - obs_result_url = args.result_url - args.result_url = '/home/work/user-job-dir/result/' - if not os.path.exists(args.result_url): - os.mkdir(args.result_url) - - args.dataset_path = args.data_url - args.save_checkpoint_path = args.ckpt_url - - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - repeat_size = cfg.epoch_size - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) - - print("============== Starting Testing ==============") - args.load_ckpt_url = os.path.join(args.save_checkpoint_path) - print("args.load_ckpt_url is:{}", args.load_ckpt_url ) - param_dict = load_checkpoint(args.load_ckpt_url ) - load_param_into_net(network, param_dict) - # 定义测试数据集,batch_size设置为1,则取出一张图片 - ds_test = create_dataset(os.path.join(args.dataset_path, "test"), batch_size=1).create_dict_iterator() - data = next(ds_test) - - # images为测试图片,labels为测试图片的实际分类 - images = data["image"].asnumpy() - labels = data["label"].asnumpy() - print('Tensor:', Tensor(data['image'])) - - # 使用函数model.predict预测image对应分类 - output = model.predict(Tensor(data['image'])) - predicted = np.argmax(output.asnumpy(), axis=1) - pred = np.argmax(output.asnumpy(), axis=1) - print('predicted:', predicted) - print('pred:', pred) - - # 输出预测分类与实际分类,并输出到result_url - print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') - filename = 'result.txt' - file_path = os.path.join(args.result_url, filename) - with open(file_path, 'a+') as file: - file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) - - # Upload results to obs - ######################## 将输出的结果拷贝到obs(固定写法) ######################## - # 把推理后的结果从本地的运行环境拷贝回obs,在启智平台相对应的推理任务中会提供下载 - try: - mox.file.copy_parallel(args.result_url, obs_result_url) - print("Successfully Upload {} to {}".format(args.result_url, obs_result_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e)) - ######################## 将输出的模型拷贝到obs ######################## \ No newline at end of file diff --git a/lenet.py b/lenet.py deleted file mode 100755 index 0600793..0000000 --- a/lenet.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -"""LeNet.""" -import mindspore.nn as nn -from mindspore.common.initializer import Normal - - -class LeNet5(nn.Cell): - """ - Lenet network - - Args: - num_class (int): Number of classes. Default: 10. - num_channel (int): Number of channels. Default: 1. - - Returns: - Tensor, output tensor - Examples: - >>> LeNet(num_class=10) - - """ - def __init__(self, num_class=10, num_channel=1, include_top=True): - super(LeNet5, self).__init__() - self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') - self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') - self.relu = nn.ReLU() - self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) - self.include_top = include_top - if self.include_top: - self.flatten = nn.Flatten() - self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) - self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) - self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) - - def construct(self, x): - x = self.conv1(x) - x = self.relu(x) - x = self.max_pool2d(x) - x = self.conv2(x) - x = self.relu(x) - x = self.max_pool2d(x) - if not self.include_top: - return x - x = self.flatten(x) - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - x = self.fc3(x) - return x diff --git a/model.py b/model.py deleted file mode 100755 index 9a7f565..0000000 --- a/model.py +++ /dev/null @@ -1,35 +0,0 @@ -from torch.nn import Module -from torch import nn - - -class Model(Module): - def __init__(self): - super(Model, self).__init__() - self.conv1 = nn.Conv2d(1, 6, 5) - self.relu1 = nn.ReLU() - self.pool1 = nn.MaxPool2d(2) - self.conv2 = nn.Conv2d(6, 16, 5) - self.relu2 = nn.ReLU() - self.pool2 = nn.MaxPool2d(2) - self.fc1 = nn.Linear(256, 120) - self.relu3 = nn.ReLU() - self.fc2 = nn.Linear(120, 84) - self.relu4 = nn.ReLU() - self.fc3 = nn.Linear(84, 10) - self.relu5 = nn.ReLU() - - def forward(self, x): - y = self.conv1(x) - y = self.relu1(y) - y = self.pool1(y) - y = self.conv2(y) - y = self.relu2(y) - y = self.pool2(y) - y = y.view(y.shape[0], -1) - y = self.fc1(y) - y = self.relu3(y) - y = self.fc2(y) - y = self.relu4(y) - y = self.fc3(y) - y = self.relu5(y) - return y diff --git a/running.py b/running.py deleted file mode 100755 index 70f957c..0000000 --- a/running.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/python -#-*- coding: UTF-8 -*- -import time -import datetime - -timeStart = datetime.datetime.now() -print(timeStart.strftime('%Y-%m-%d %H:%M:%S')) -for letter in 'Python': - print('当前字母:%s' % letter) - print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) - time.sleep(30) - - -timeEnd = datetime.datetime.now() -print(timeEnd.strftime('%Y-%m-%d %H:%M:%S')) -print('经历多少秒:%s' % (timeEnd - timeStart).seconds) \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100755 index 2621bc3..0000000 --- a/test.py +++ /dev/null @@ -1 +0,0 @@ -print('for test only') \ No newline at end of file diff --git a/test_c2net_npu.py b/test_c2net_npu.py deleted file mode 100755 index 1197b1a..0000000 --- a/test_c2net_npu.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/python -#coding=utf-8 - -""" -######################## train lenet example ######################## -train lenet and get network model files(.ckpt) -""" - -import os -import argparse -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore import context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.common import set_seed - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') - -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -set_seed(1) - -if __name__ == "__main__": - args = parser.parse_args() - print('args:') - print(args) - - # train_dir = '/tmp/output' - # data_dir = '/tmp/dataset' - train_dir = '/cache/output' - data_dir = '/cache/dataset' - - #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - #创建数据集 - ds_train = create_dataset(os.path.join(data_dir, "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - #创建网络 - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - #定义模型输出路径 - ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", - directory=train_dir, - config=config_ck) - #开始训练 - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - - print("============== Finish Training ==============") \ No newline at end of file diff --git a/tf_train.py b/tf_train.py deleted file mode 100755 index 31d884d..0000000 --- a/tf_train.py +++ /dev/null @@ -1,158 +0,0 @@ -# coding: utf-8 -import tensorflow as tf -from tensorflow.examples.tutorials.mnist import input_data -import os -import argparse -import moxing as mox - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -workroot = '/home/work/user-job-dir' -#初始化过滤器 -def weight_variable(shape): - return tf.Variable(tf.truncated_normal(shape, stddev=0.1)) - -#初始化偏置,初始化时,所有值是0.1 -def bias_variable(shape): - return tf.Variable(tf.constant(0.1, shape=shape)) - -#卷积运算,strides表示每一维度滑动的步长,一般strides[0]=strides[3]=1 -#第四个参数可选"Same"或"VALID",“Same”表示边距使用全0填充 -def conv2d(x, W): - return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") - - -#池化运算 -def max_pool_2x2(x): - - return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") - -def parse_args(): - parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -# define 2 parameters for running on modelArts -# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 - parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= workroot + '/data/') - - parser.add_argument('--train_url', - help='model folder to save/load', - default= workroot + '/model/') - parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') - -#modelarts已经默认使用data_url和train_url - parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - - args = parser.parse_args() - return args - - -if __name__ == "__main__": - args = parse_args() - print('args:') - print(args) - - mnist = input_data.read_data_sets('mnist_data', one_hot=True) - - #创建x占位符,用于临时存放MNIST图片的数据, - # [None, 784]中的None表示不限长度,而784则是一张图片的大小(28×28=784) - x = tf.placeholder(tf.float32, [None, 784], name='input') - #y_存的是实际图像的标签,即对应于每张输入图片实际的值 - y_ = tf.placeholder(tf.float32, [None, 10]) - - #将图片从784维向量重新还原为28×28的矩阵图片, - # 原因参考卷积神经网络模型图,最后一个参数代表深度, - # 因为MNIST是黑白图片,所以深度为1, - # 第一个参数为-1,表示一维的长度不限定,这样就可以灵活设置每个batch的训练的个数了 - x_image = tf.reshape(x, [-1, 28, 28, 1]) - - #第一层卷积 - #将过滤器设置成5×5×1的矩阵, - #其中5×5表示过滤器大小,1表示深度,因为MNIST是黑白图片只有一层。所以深度为1 - #32表示我们要创建32个大小5×5×1的过滤器,经过卷积后算出32个特征图(每个过滤器得到一个特征图),即输出深度为64 - W_conv1 = weight_variable([5, 5, 1, 32]) - #有多少个特征图就有多少个偏置 - b_conv1 = bias_variable([32]) - #使用conv2d函数进行卷积计算,然后再用ReLU作为激活函数 - h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) - #卷积以后再经过池化操作 - h_pool1 = max_pool_2x2(h_conv1) - - #第二层卷积 - #因为经过第一层卷积运算后,输出的深度为32,所以过滤器深度和下一层输出深度也做出改变 - W_conv2 = weight_variable([5, 5, 32, 64]) - b_conv2 = bias_variable([64]) - h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) - h_pool2 = max_pool_2x2(h_conv2) - - #全连接层 - #经过两层卷积后,图片的大小为7×7(第一层池化后输出为(28/2)×(28/2), - #第二层池化后输出为(14/2)×(14/2)),深度为64, - #我们在这里加入一个有1024个神经元的全连接层,所以权重W的尺寸为[7 * 7 * 64, 1024] - W_fc1 = weight_variable([7 * 7 * 64, 1024]) - #偏置的个数和权重的个数一致 - b_fc1 = bias_variable([1024]) - #这里将第二层池化后的张量(长:7 宽:7 深度:64) 变成向量(跟上一节的Softmax模型的输入一样了) - h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) - #使用ReLU激活函数 - h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) - - #dropout - #为了减少过拟合,我们在输出层之前加入dropout - keep_prob = tf.placeholder(tf.float32, name='keep_prob') - h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) - - #输出层 - #全连接层输入的大小为1024,而我们要得到的结果的大小是10(0~9), - # 所以这里权重W的尺寸为[1024, 10] - W_fc2 = weight_variable([1024, 10]) - b_fc2 = bias_variable([10]) - - #最后都要经过Softmax函数将输出转化为概率问题 - y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output') - - #损失函数和损失优化 - cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv))) - train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) - - #测试准确率,跟Softmax回归模型的一样 - correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) - accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) - - train_dir = workroot + '/model/' #模型存放路径 - if not os.path.exists(train_dir): - os.mkdir(train_dir) - obs_train_url = args.train_url - #开始训练 - with tf.Session() as sess: - #初始化所有变量 - sess.run(tf.global_variables_initializer()) - #训练两万次 - for i in range(2000): - #每次获取50张图片数据和对应的标签 - batch = mnist.train.next_batch(50) - #每训练100次,我们打印一次训练的准确率 - if i % 100 == 0: - train_accuracy =sess.run(accuracy, feed_dict={x:batch[0], y_:batch[1], keep_prob:1.0}) - print("step %d, training accuracy %g" % (i, train_accuracy)) - #这里是真的训练,将数据传入 - sess.run(train_step, feed_dict={x:batch[0], y_:batch[1], keep_prob:0.5}) - - # 用SavedModel的方式保存 - tf.compat.v1.saved_model.simple_save(sess, - train_dir +"saved_model", - inputs={"input": x, 'keep_prob':keep_prob}, - outputs={"output": y_conv}) - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) diff --git a/tf_train_new.py b/tf_train_new.py deleted file mode 100755 index 119cec1..0000000 --- a/tf_train_new.py +++ /dev/null @@ -1,146 +0,0 @@ -# coding: utf-8 -import tensorflow as tf -from tensorflow.examples.tutorials.mnist import input_data -import os - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - -mnist = input_data.read_data_sets('mnist_data', one_hot=True) - -#初始化过滤器 -def weight_variable(shape): - return tf.Variable(tf.truncated_normal(shape, stddev=0.1)) - -#初始化偏置,初始化时,所有值是0.1 -def bias_variable(shape): - return tf.Variable(tf.constant(0.1, shape=shape)) - -#卷积运算,strides表示每一维度滑动的步长,一般strides[0]=strides[3]=1 -#第四个参数可选"Same"或"VALID",“Same”表示边距使用全0填充 -def conv2d(x, W): - return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME") - - -#池化运算 -def max_pool_2x2(x): - - return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME") - -#创建x占位符,用于临时存放MNIST图片的数据, -# [None, 784]中的None表示不限长度,而784则是一张图片的大小(28×28=784) -x = tf.placeholder(tf.float32, [None, 784], name='input') -#y_存的是实际图像的标签,即对应于每张输入图片实际的值 -y_ = tf.placeholder(tf.float32, [None, 10]) - -#将图片从784维向量重新还原为28×28的矩阵图片, -# 原因参考卷积神经网络模型图,最后一个参数代表深度, -# 因为MNIST是黑白图片,所以深度为1, -# 第一个参数为-1,表示一维的长度不限定,这样就可以灵活设置每个batch的训练的个数了 -x_image = tf.reshape(x, [-1, 28, 28, 1]) - -#第一层卷积 -#将过滤器设置成5×5×1的矩阵, -#其中5×5表示过滤器大小,1表示深度,因为MNIST是黑白图片只有一层。所以深度为1 -#32表示我们要创建32个大小5×5×1的过滤器,经过卷积后算出32个特征图(每个过滤器得到一个特征图),即输出深度为64 -W_conv1 = weight_variable([5, 5, 1, 32]) -#有多少个特征图就有多少个偏置 -b_conv1 = bias_variable([32]) -#使用conv2d函数进行卷积计算,然后再用ReLU作为激活函数 -h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) -#卷积以后再经过池化操作 -h_pool1 = max_pool_2x2(h_conv1) - -#第二层卷积 -#因为经过第一层卷积运算后,输出的深度为32,所以过滤器深度和下一层输出深度也做出改变 -W_conv2 = weight_variable([5, 5, 32, 64]) -b_conv2 = bias_variable([64]) -h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) -h_pool2 = max_pool_2x2(h_conv2) - -#全连接层 -#经过两层卷积后,图片的大小为7×7(第一层池化后输出为(28/2)×(28/2), -#第二层池化后输出为(14/2)×(14/2)),深度为64, -#我们在这里加入一个有1024个神经元的全连接层,所以权重W的尺寸为[7 * 7 * 64, 1024] -W_fc1 = weight_variable([7 * 7 * 64, 1024]) -#偏置的个数和权重的个数一致 -b_fc1 = bias_variable([1024]) -#这里将第二层池化后的张量(长:7 宽:7 深度:64) 变成向量(跟上一节的Softmax模型的输入一样了) -h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) -#使用ReLU激活函数 -h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) - -#dropout -#为了减少过拟合,我们在输出层之前加入dropout -keep_prob = tf.placeholder(tf.float32, name='keep_prob') -h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob) - -#输出层 -#全连接层输入的大小为1024,而我们要得到的结果的大小是10(0~9), -# 所以这里权重W的尺寸为[1024, 10] -W_fc2 = weight_variable([1024, 10]) -b_fc2 = bias_variable([10]) - -#最后都要经过Softmax函数将输出转化为概率问题 -y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2, name='output') - -#损失函数和损失优化 -cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv))) -train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) - -#测试准确率,跟Softmax回归模型的一样 -correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) -accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) - -# #将训练结果保存,如果不保存我们这次训练结束后的结果也随着程序运行结束而释放了 -# savePath = './mnist_conv/' -# saveFile = savePath + 'mnist_conv.ckpt' -# if os.path.exists(savePath) == False: -# os.mkdir(savePath) - -# saver = tf.train.Saver() - - -#开始训练 -with tf.Session() as sess: - - #初始化所有变量 - sess.run(tf.global_variables_initializer()) - - #训练两万次 - for i in range(2000): - - #每次获取50张图片数据和对应的标签 - batch = mnist.train.next_batch(50) - - #每训练100次,我们打印一次训练的准确率 - if i % 100 == 0: - train_accuracy =sess.run(accuracy, feed_dict={x:batch[0], y_:batch[1], keep_prob:1.0}) - print("step %d, training accuracy %g" % (i, train_accuracy)) - - #这里是真的训练,将数据传入 - sess.run(train_step, feed_dict={x:batch[0], y_:batch[1], keep_prob:0.5}) - - - - # print ("end train, start testing...") - # mean_value = 0.0 - # for i in range(mnist.test.labels.shape[0]): - # batch = mnist.test.next_batch(50) - # train_accuracy = sess.run(accuracy, feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0}) - # mean_value += train_accuracy - - - - # print("test accuracy %g" % (mean_value / mnist.test.labels.shape[0])) - # #训练结束后,我们使用mnist.test在测试最后的准确率 - # print("test accuracy %g" % sess.run(accuracy, feed_dict={x:mnist.test.images, y_:mnist.test.labels, keep_prob:1.0})) - - - # 最后,将会话保存下来 - # saver.save(sess, saveFile) - - # 用SavedModel的方式保存 - tf.compat.v1.saved_model.simple_save(sess, - "./saved_model", - inputs={"input": x, 'keep_prob':keep_prob}, - outputs={"output": y_conv}) diff --git a/train.py b/train.py deleted file mode 100755 index 4b8ba9b..0000000 --- a/train.py +++ /dev/null @@ -1,185 +0,0 @@ -""" -######################## train lenet example ######################## -train lenet and get network model files(.ckpt) -""" -""" -######################## 训练环境使用说明 ######################## -假设已经使用Ascend NPU调试环境调试完代码,欲将调试环境的代码迁移到训练环境进行训练,需要做以下工作: -1、调试环境的镜像和训练环境的镜像是两个不同的镜像,所处的运行目录不一致,需要将data_url和train_url的路径进行变换 -在调试环境中: -args.data_url = '/home/ma-user/work/data/' //数据集位置 -args.train_url = '/home/ma-user/work/model/' //训练输出的模型位置 -在训练环境变换为: -args.data_url = '/home/work/user-job-dir/data/' -args.train_url = '/home/work/user-job-dir/model/' -2、在训练环境中,需要将数据集从obs拷贝到训练镜像中,训练完以后,需要将输出的模型拷贝到obs. -将数据集从obs拷贝到训练镜像中: - - obs_data_url = args.data_url - args.data_url = '/home/work/user-job-dir/data/' - if not os.path.exists(args.data_url): - os.mkdir(args.data_url) - try: - mox.file.copy_parallel(obs_data_url, args.data_url) - print("Successfully Download {} to {}".format(obs_data_url, - args.data_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_data_url, args.data_url) + str(e)) - -将输出的模型拷贝到obs: - obs_train_url = args.train_url - args.train_url = '/home/work/user-job-dir/model/' - if not os.path.exists(args.train_url): - os.mkdir(args.train_url) -try: - mox.file.copy_parallel(args.train_url, obs_train_url) - print("Successfully Upload {} to {}".format(args.train_url, - obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(args.train_url, - obs_train_url) + str(e)) - -""" - -import os -import numpy as np -import argparse -import moxing as mox -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore import context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.common import set_seed -from mindspore import Tensor, export - -#配置默认的工作空间根目录 -# environment = 'debug' -environment = 'train' -if environment == 'debug': - workroot = '/home/ma-user/work' #调试任务使用该参数 -else: - workroot = '/home/work/user-job-dir' # 训练任务使用该参数 -print('current work mode:' + environment + ', workroot:' + workroot) - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -# define 2 parameters for running on modelArts -# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 -parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= workroot + '/data/') - -parser.add_argument('--train_url', - help='model folder to save/load', - default= workroot + '/model/') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') - -#modelarts已经默认使用data_url和train_url -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -set_seed(1) - -if __name__ == "__main__": - args = parser.parse_args() - print('args:') - print(args) - - data_dir = workroot + '/data' #数据集存放路径 - train_dir = workroot + '/model' #模型存放路径 - #初始化数据存放目录 - if not os.path.exists(data_dir): - os.mkdir(data_dir) - #初始化模型存放目录 - obs_train_url = args.train_url - train_dir = workroot + '/model/' - if not os.path.exists(train_dir): - os.mkdir(train_dir) - ######################## 将数据集从obs拷贝到训练镜像中 (固定写法)######################## - # 在训练环境中定义data_url和train_url,并把数据从obs拷贝到相应的固定路径,以下写法是将数据拷贝到/home/work/user-job-dir/data/目录下,可修改为其他目录 - #创建数据存放的位置 - if environment == 'train': - obs_data_url = args.data_url - #将数据拷贝到训练环境 - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, - data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_data_url, data_dir) + str(e)) -######################## 将数据集从obs拷贝到训练镜像中 ######################## - - #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - #创建数据集 - ds_train = create_dataset(os.path.join(data_dir, "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - #创建网络 - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - #定义模型输出路径 - ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", - directory=train_dir, - config=config_ck) - #开始训练 - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - input = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32) - export(network, Tensor(input), file_name=(train_dir +'LeNet5_model'), file_format='MINDIR') - - export(network, Tensor(input), file_name=(train_dir +'LeNet5_onnx_model'), file_format='ONNX') - ######################## 将输出的模型拷贝到obs(固定写法) ######################## - # 把训练后的模型数据从本地的运行环境拷贝回obs,在启智平台相对应的训练任务中会提供下载 - if environment == 'train': - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir, - obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir, - obs_train_url) + str(e)) - ######################## 将输出的模型拷贝到obs ######################## diff --git a/train_longparam.py b/train_longparam.py deleted file mode 100755 index 7546110..0000000 --- a/train_longparam.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -######################## train lenet example ######################## -train lenet and get network model files(.ckpt) -""" -""" -######################## 训练环境使用说明 ######################## -假设已经使用Ascend NPU调试环境调试完代码,欲将调试环境的代码迁移到训练环境进行训练,需要做以下工作: -1、调试环境的镜像和训练环境的镜像是两个不同的镜像,所处的运行目录不一致,需要将data_url和train_url的路径进行变换 -在调试环境中: -args.data_url = '/home/ma-user/work/data/' //数据集位置 -args.train_url = '/home/ma-user/work/model/' //训练输出的模型位置 -在训练环境变换为: -args.data_url = '/home/work/user-job-dir/data/' -args.train_url = '/home/work/user-job-dir/model/' -2、在训练环境中,需要将数据集从obs拷贝到训练镜像中,训练完以后,需要将输出的模型拷贝到obs. -将数据集从obs拷贝到训练镜像中: - - obs_data_url = args.data_url - args.data_url = '/home/work/user-job-dir/data/' - if not os.path.exists(args.data_url): - os.mkdir(args.data_url) - try: - mox.file.copy_parallel(obs_data_url, args.data_url) - print("Successfully Download {} to {}".format(obs_data_url, - args.data_url)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_data_url, args.data_url) + str(e)) - -将输出的模型拷贝到obs: - obs_train_url = args.train_url - args.train_url = '/home/work/user-job-dir/model/' - if not os.path.exists(args.train_url): - os.mkdir(args.train_url) -try: - mox.file.copy_parallel(args.train_url, obs_train_url) - print("Successfully Upload {} to {}".format(args.train_url, - obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(args.train_url, - obs_train_url) + str(e)) - -""" - -import os -import numpy as np -import argparse -import moxing as mox -from config import mnist_cfg as cfg -from dataset import create_dataset -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore import context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.common import set_seed -from mindspore import Tensor, export - -#配置默认的工作空间根目录 -# environment = 'debug' -environment = 'train' -if environment == 'debug': - workroot = '/home/ma-user/work' #调试任务使用该参数 -else: - workroot = '/home/work/user-job-dir' # 训练任务使用该参数 -print('current work mode:' + environment + ', workroot:' + workroot) - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -# define 2 parameters for running on modelArts -# data_url,train_url是固定用于在modelarts上训练的参数,表示数据集的路径和输出模型的路径 -parser.add_argument('--data_url', - help='path to training/inference dataset folder', - default= workroot + '/data/') - -parser.add_argument('--train_url', - help='model folder to save/load', - default= workroot + '/model/') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: CPU),若要在启智平台上使用NPU,需要在启智平台训练界面上加上运行参数device_target=Ascend') - -#modelarts已经默认使用data_url和train_url -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -parser.add_argument('--openI', - help='model folder to save/load', - default= True) -parser.add_argument('--sink_mode', - help='model folder to save/load', - default= True) -parser.add_argument('--dataset', - help='model folder to save/load', - default= 'hmdb51') -parser.add_argument('--checkpoint_path', - help='model folder to save/load', - default= './src/pretrained/rgb_imagenet.ckpt') -parser.add_argument('--mode', - help='model folder to save/load', - default= 'rgb') -parser.add_argument('--num_epochs', - help='model folder to save/load', - default= 40) -parser.add_argument('--distributed', - help='model folder to save/load', - default= True) - -set_seed(1) - -if __name__ == "__main__": - args = parser.parse_args() - print('args:') - print(args) - - data_dir = workroot + '/data' #数据集存放路径 - train_dir = workroot + '/model' #模型存放路径 - #初始化数据存放目录 - if not os.path.exists(data_dir): - os.mkdir(data_dir) - #初始化模型存放目录 - obs_train_url = args.train_url - train_dir = workroot + '/model/' - if not os.path.exists(train_dir): - os.mkdir(train_dir) - ######################## 将数据集从obs拷贝到训练镜像中 (固定写法)######################## - # 在训练环境中定义data_url和train_url,并把数据从obs拷贝到相应的固定路径,以下写法是将数据拷贝到/home/work/user-job-dir/data/目录下,可修改为其他目录 - #创建数据存放的位置 - if environment == 'train': - obs_data_url = args.data_url - #将数据拷贝到训练环境 - try: - mox.file.copy_parallel(obs_data_url, data_dir) - print("Successfully Download {} to {}".format(obs_data_url, - data_dir)) - except Exception as e: - print('moxing download {} to {} failed: '.format( - obs_data_url, data_dir) + str(e)) -######################## 将数据集从obs拷贝到训练镜像中 ######################## - - #注意:这里很重要,指定了训练所用的设备CPU还是Ascend NPU - context.set_context(mode=context.GRAPH_MODE, - device_target=args.device_target) - #创建数据集 - ds_train = create_dataset(os.path.join(data_dir, "train"), - cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError( - "Please check dataset size > 0 and batch_size <= dataset size") - #创建网络 - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - #定义模型输出路径 - ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", - directory=train_dir, - config=config_ck) - #开始训练 - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - input = np.random.uniform(0.0, 1.0, size=[1, 1, 32, 32]).astype(np.float32) - export(network, Tensor(input), file_name=(train_dir +'LeNet5_model'), file_format='MINDIR') - - export(network, Tensor(input), file_name=(train_dir +'LeNet5_onnx_model'), file_format='ONNX') - ######################## 将输出的模型拷贝到obs(固定写法) ######################## - # 把训练后的模型数据从本地的运行环境拷贝回obs,在启智平台相对应的训练任务中会提供下载 - if environment == 'train': - try: - mox.file.copy_parallel(train_dir, obs_train_url) - print("Successfully Upload {} to {}".format(train_dir, - obs_train_url)) - except Exception as e: - print('moxing upload {} to {} failed: '.format(train_dir, - obs_train_url) + str(e)) - ######################## 将输出的模型拷贝到obs ########################