diff --git a/OpenI云脑使用教程.ipynb b/OpenI云脑使用教程.ipynb index f3823fa..e69de29 100644 --- a/OpenI云脑使用教程.ipynb +++ b/OpenI云脑使用教程.ipynb @@ -1,227 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "7d112f9b-84ba-420d-a52b-9eb7ba307068", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looking in indexes: http://pip.modelarts.private.com:8888/repository/pypi/simple\n", - "Requirement already satisfied: openi-test==0.7.1 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (0.7.1)\n", - "Requirement already satisfied: requests in /home/ma-user/modelarts-dev/modelarts-sdk (from openi-test==0.7.1) (2.28.2)\n", - "Requirement already satisfied: tqdm in /home/ma-user/modelarts-dev/modelarts-sdk (from openi-test==0.7.1) (4.64.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /home/ma-user/modelarts-dev/modelarts-sdk (from requests->openi-test==0.7.1) (3.3.2)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (from requests->openi-test==0.7.1) (1.26.12)\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/ma-user/modelarts-dev/modelarts-sdk (from requests->openi-test==0.7.1) (3.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (from requests->openi-test==0.7.1) (2022.6.15)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "pip install openi-test==0.7.1" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "02ad2e02-6533-4da0-98c3-c5f238d4d8f7", - "metadata": {}, - "outputs": [], - "source": [ - "#导入包\n", - "from openi.context import prepare, upload_openi" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "69880626-9320-46cd-ad29-8e5f7be09f32", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "INFO:root:Using MoXing-v2.1.0.5d9c87c8-5d9c87c8\n", - "INFO:root:Using OBS-Python-SDK-3.20.9.1\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🎉 Successfully Download s3:///urchincache/attachment/d/d/ddabdf57-a65a-496c-bef0-19d82b9043cd/MNISTData.zip to /home/ma-user/work/dataset/MNISTData.zip\n", - "🎉 Successfully Extracted /home/ma-user/work/dataset/MNISTData.zip\n", - "🎉 Successfully Deleted /home/ma-user/work/dataset/MNISTData.zip\n", - "🎉 Successfully Download s3:///urchincache/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf8/checkpoint_lenet-1_1875.zip to /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n", - "🎉 Successfully Extracted /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n", - "🎉 Successfully Deleted /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n", - "🎉 Successfully Download s3:///urchincache/aimodels/0/c/0cf4367b-5234-4967-a41f-f548d3f69fcf/ to /home/ma-user/work/pretrainmodel/MNIST_Example_model_zjdt\n", - "please set the output location to /home/ma-user/work/output\n" - ] - } - ], - "source": [ - "\n", - "#初始化导入数据集和预训练模型到容器内\n", - "openi_context = prepare()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c586f98f-bead-4dc9-a22f-173a672d456b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/ma-user/work/dataset\n" - ] - }, - { - "data": { - "text/plain": [ - "['checkpoint_lenet-1_1875', 'MNISTData']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#获取数据集路径,预训练模型路径,输出路径\n", - "dataset_path = openi_context.dataset_path\n", - "print(dataset_path)\n", - "\n", - "import os\n", - "os.listdir(dataset_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "7d6617f0-7b86-4b1b-a201-ecdc58db53a5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"/home/ma-user/work/pretrainmodel\n" - ] - }, - { - "data": { - "text/plain": [ - "['MNIST_Example_model_zjdt']" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pretrain_model_path = openi_context.pretrain_model_path\n", - "print(pretrain_model_path)\n", - "os.listdir(pretrain_model_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "6bc51211-5555-452e-9d83-adcfee1c4f79", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/ma-user/work/output\n" - ] - }, - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_path = openi_context.output_path\n", - "print(output_path)\n", - "os.listdir(output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "48b5da5d-a55f-4781-9056-b886d41779c7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "upload /home/ma-user/work/output to openi\n" - ] - }, - { - "data": { - "text/plain": [ - "'/home/ma-user/work/output'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#回传结果到openi,训练任务才能回传,调试任务回传后也是不支持下载\n", - "upload_openi()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75e7ce04-594e-4e8f-8292-15241709eb5e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python-3.7.10", - "language": "python", - "name": "python-3.7.10" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/README.md b/README.md index 
fd89f54..5d0e4c8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ pip install -U c2net-beta ``` #导入包 -from c2net.context import prepare, upload_output +from c2net.context import prepare #初始化导入数据集和预训练模型到容器内 c2net_context = prepare() @@ -22,12 +22,12 @@ dataset_path = c2net_context.dataset_path pretrain_model_path = c2net_context.pretrain_model_path output_path = c2net_context.output_path -#回传结果到openi,训练任务才能回传,调试任务回传后也是不支持下载 -upload_output() +#必须将输出结果保存到c2net_context.output_path,才能回传结果到openi,并且训练任务才能回传,调试任务回传后暂时不支持下载 ``` ## 2. 手写数字识别示例 * GPU示例请参考[gpu_mnist_example](./gpu_mnist_example/README.md) +* GPGPU示例请参考[gpgpu_mnist_example](./gpgpu_mnist_example/README.md) * NPU示例请参考[npu_mnist_example](./npu_mnist_example/README.md) * GCU示例请参考[gcu_mnist_example](./gcu_mnist_example/README.md) diff --git a/gcu_mnist_example/README.md b/gcu_mnist_example/README.md index c741b59..ca723b4 100644 --- a/gcu_mnist_example/README.md +++ b/gcu_mnist_example/README.md @@ -1,53 +1,3 @@ -<<<<<<< HEAD -# 如何在启智平台上进行模型训练—GCU示例 - -## 1.启智集群和智算集群的GCU训练样例 - -###### 启智集群的示例代码: - -- 训练示例请参考示例中[train_gcu.py](./train_gcu.py)的代码注释 - -## 2. 
在openi上获取数据集,预训练模型,输出路径 - -安装openi包 - -``` -pip install -U openi -``` - -使用openi包 - -``` -#导入包 -from openi.context import prepare, upload_openi - -#初始化导入数据集和预训练模型到容器内 -openi_context = prepare() - -#获取数据集路径,预训练模型路径,输出路径 -dataset_path = openi_context.dataset_path -pretrain_model_path = openi_context.pretrain_model_path -output_path = openi_context.output_path - -#回传结果到openi -upload_openi() -``` - -## 3.FAQ - -### 3.1 关于启智平台公共库[openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi): - -主要使用的方法有以下几个: - -``` -prepare 准备数据集,模型,输出路径 -upload_openi 将训练镜像的输出结果拷贝回启智平台 -``` - -### 3.2 解决参数报错问题: - -请在代码中加入 `args, unknown = parser.parse_known_args()`,可忽略掉 `--ckpt_url`, `--multi_date_url`等参数报错问题 -======= # 如何在启智平台上进行模型训练 - GCU版本 - 启智集群单数据集的训练,启智集群多数据集的训练,智算集群的单数据集训练,这3个的训练使用方式不同,请注意区分。数据加载方式、模型定义逻辑大致同[手写数字识别GPU版本_PytorchExample](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GPU)项目: @@ -166,6 +116,5 @@ upload_openi 将训练镜像的输出结果拷贝回启智平台 目前训练任务的日志在代码中print输出,参考示例train_for_c2net.py代码相关print ### 4.2 训练结束后可以下载模型文件 ->>>>>>> origin/liuzx ## 对于示例代码有任何问题,欢迎在本项目中提issue。 diff --git a/gcu_mnist_example/model.py b/gcu_mnist_example/model.py index ae424a7..157bad6 100644 --- a/gcu_mnist_example/model.py +++ b/gcu_mnist_example/model.py @@ -32,4 +32,4 @@ class Model(Module): y = self.relu4(y) y = self.fc3(y) y = self.relu5(y) - return y + return y \ No newline at end of file diff --git a/gcu_mnist_example/train_gcu.py b/gcu_mnist_example/train.py similarity index 99% rename from gcu_mnist_example/train_gcu.py rename to gcu_mnist_example/train.py index b49cc5d..ee1867e 100644 --- a/gcu_mnist_example/train_gcu.py +++ b/gcu_mnist_example/train.py @@ -140,4 +140,4 @@ if __name__ == '__main__': #The model output location is placed under /tmp/output state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1} torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum)) - 
print(os.listdir('{}'.format(c2net_context.output_path))) + print(os.listdir('{}'.format(c2net_context.output_path))) \ No newline at end of file diff --git a/gpgpu_mnist_example/README.md b/gpgpu_mnist_example/README.md new file mode 100644 index 0000000..8369e3e --- /dev/null +++ b/gpgpu_mnist_example/README.md @@ -0,0 +1,53 @@ +# 如何在启智平台上进行模型训练—GPGPU示例 + +## 1.启智集群和智算集群的GPGPU训练样例 + +###### 启智集群的示例代码: + +- 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 + +## 2. 在云脑上获取数据集,预训练模型,输出路径 + +安装c2net包 + +``` +pip install -U c2net-beta +``` + +使用c2net包 + +``` +#导入包 +from c2net.context import prepare,upload_output + +#初始化导入数据集和预训练模型到容器内 +c2net_context = prepare() + +#获取数据集路径,预训练模型路径,输出路径 +dataset_path = c2net_context.dataset_path +pretrain_model_path = c2net_context.pretrain_model_path +output_path = c2net_context.output_path + +#回传结果 +upload_output() + +``` + +## 3.[创建GPU训练示例任务界面教程](./Example_Picture/快速创建GPU训练任务.md) + +## 4.FAQ + +### 4.1 关于启智平台公共库[c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi): + +主要使用的方法有以下几个: + +``` +prepare 准备数据集,模型,输出路径 +upload_output 将训练镜像的输出结果拷贝回启智平台 +``` + +### 4.2 解决参数报错问题: + +请在代码中加入 `args, unknown = parser.parse_known_args()`,可忽略掉 `--ckpt_url`,`--data_url`, `--multi_date_url`等参数报错问题 + +## 对于示例代码有任何问题,欢迎在本项目中提issue。 diff --git a/gpgpu_mnist_example/inference.py b/gpgpu_mnist_example/inference.py new file mode 100644 index 0000000..6c78486 --- /dev/null +++ b/gpgpu_mnist_example/inference.py @@ -0,0 +1,82 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +1,The dataset structure of the single-dataset in this example + MnistDataset_torch.zip + ├── test + └── train + +''' +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse
+import os +#导入c2net包 +from c2net.context import prepare + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +# 参数声明 +WORKERS = 0 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = Model().to(device) +optimizer = SGD(model.parameters(), lr=1e-1) +cost = CrossEntropyLoss() + +# 模型测试 +def test(model, test_loader, data_length): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for i, data in enumerate(test_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + y_hat = model(x) + test_loss += cost(y_hat, y).item() + pred = y_hat.max(1, keepdim=True)[1] + correct += pred.eq(y.view_as(pred)).sum().item() + test_loss /= (i+1) + + # 结果写入输出文件夹 + filename = 'result.txt' + file_path = os.path.join('/tmp/output', filename) + with open(file_path, 'w') as file: + file.write('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, data_length, 100. 
* correct / data_length)) + + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875" + MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" + #log output + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + epochs = args.epoch_size + test_dataset = mnist.MNIST(root=mnist_example_test2_model_djts_path + "/test", train=False, transform=ToTensor(),download=False) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + checkpoint = torch.load(mnist_example_test2_model_djts_path + "/mnist_epoch1_0.73.pkl") + model.load_state_dict(checkpoint['model']) + test(model,test_loader,len(test_dataset)) \ No newline at end of file diff --git a/gpgpu_mnist_example/model.py b/gpgpu_mnist_example/model.py new file mode 100644 index 0000000..ae424a7 --- /dev/null +++ b/gpgpu_mnist_example/model.py @@ -0,0 +1,35 @@ +from torch.nn import Module +from torch import nn + + +class Model(Module): + def __init__(self): + super(Model, self).__init__() + self.conv1 = nn.Conv2d(1, 6, 5) + self.relu1 = nn.ReLU() + self.pool1 = nn.MaxPool2d(2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.relu2 = nn.ReLU() + self.pool2 = nn.MaxPool2d(2) + self.fc1 = nn.Linear(256, 120) + self.relu3 = nn.ReLU() + self.fc2 = nn.Linear(120, 84) + self.relu4 = nn.ReLU() + self.fc3 = nn.Linear(84, 10) + self.relu5 = nn.ReLU() + + def forward(self, x): + y = self.conv1(x) + y = self.relu1(y) + y = self.pool1(y) + y = self.conv2(y) + y = self.relu2(y) + y = self.pool2(y) + y = y.view(y.shape[0], -1) + y = self.fc1(y) + y = self.relu3(y) + y = self.fc2(y) + y = 
self.relu4(y) + y = self.fc3(y) + y = self.relu5(y) + return y diff --git a/gpu_mnist_example/README.md b/gpu_mnist_example/README.md index a12cb78..a98ddf6 100644 --- a/gpu_mnist_example/README.md +++ b/gpu_mnist_example/README.md @@ -4,46 +4,47 @@ ###### 启智集群的示例代码: -- 训练示例请参考示例中[train_gpu.py](./train_gpu.py)的代码注释 +- 训练任务示例请参考示例中[train.py](./train.py)的代码注释 +- 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 -## 2. 在openi上获取数据集,预训练模型,输出路径 +## 2. 在云脑上获取数据集,预训练模型,输出路径 -安装openi包 +安装c2net包 ``` -pip install -U openi +pip install -U c2net-beta ``` -使用openi包 +使用c2net包 ``` #导入包 -from openi.context import prepare, upload_openi +from c2net.context import prepare,upload_output #初始化导入数据集和预训练模型到容器内 -openi_context = prepare() +c2net_context = prepare() #获取数据集路径,预训练模型路径,输出路径 -dataset_path = openi_context.dataset_path -pretrain_model_path = openi_context.pretrain_model_path +dataset_path = c2net_context.dataset_path +pretrain_model_path = c2net_context.pretrain_model_path output_path = openi_context.output_path -#回传结果到openi -upload_openi() +#回传结果 +upload_output() + ``` ## 3.[创建GPU训练示例任务界面教程](./Example_Picture/快速创建GPU训练任务.md) ## 4.FAQ -### 4.1 关于启智平台公共库[openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi): +### 4.1 关于启智平台公共库[c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi): 主要使用的方法有以下几个: ``` prepare 准备数据集,模型,输出路径 -upload_openi 将训练镜像的输出结果拷贝回启智平台 - +upload_output 将训练镜像的输出结果拷贝回启智平台 ``` ### 4.2 解决参数报错问题: diff --git a/gpu_mnist_example/inference.py b/gpu_mnist_example/inference.py new file mode 100644 index 0000000..6c78486 --- /dev/null +++ b/gpu_mnist_example/inference.py @@ -0,0 +1,82 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +1,The dataset structure of the single-dataset in this example + MnistDataset_torch.zip + ├── test + └── train + +''' +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import 
CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse +import os +#导入c2net包 +from c2net.context import prepare + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +# 参数声明 +WORKERS = 0 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = Model().to(device) +optimizer = SGD(model.parameters(), lr=1e-1) +cost = CrossEntropyLoss() + +# 模型测试 +def test(model, test_loader, data_length): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for i, data in enumerate(test_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + y_hat = model(x) + test_loss += cost(y_hat, y).item() + pred = y_hat.max(1, keepdim=True)[1] + correct += pred.eq(y.view_as(pred)).sum().item() + test_loss /= (i+1) + + # 结果写入输出文件夹 + filename = 'result.txt' + file_path = os.path.join('/tmp/output', filename) + with open(file_path, 'w') as file: + file.write('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, data_length, 100. 
* correct / data_length)) + + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875" + MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" + #log output + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + epochs = args.epoch_size + test_dataset = mnist.MNIST(root=mnist_example_test2_model_djts_path + "/test", train=False, transform=ToTensor(),download=False) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + checkpoint = torch.load(mnist_example_test2_model_djts_path + "/mnist_epoch1_0.73.pkl") + model.load_state_dict(checkpoint['model']) + test(model,test_loader,len(test_dataset)) \ No newline at end of file diff --git a/gpu_mnist_example/train_gpu.py b/gpu_mnist_example/train.py similarity index 95% rename from gpu_mnist_example/train_gpu.py rename to gpu_mnist_example/train.py index 88d20e1..493c336 100644 --- a/gpu_mnist_example/train_gpu.py +++ b/gpu_mnist_example/train.py @@ -11,9 +11,7 @@ If there are Chinese comments in the code,please add at the beginning: └── train ''' -import os -os.system("pip install openi-test") -os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) + from model import Model import numpy as np @@ -26,7 +24,7 @@ from torchvision.transforms import ToTensor import argparse import os #导入c2net包 -from c2net.context import prepare, upload_output +from c2net.context import prepare # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') @@ -114,7 +112,5 @@ if __name__ == '__main__': # 将模型保存到c2net_context.output_path state = 
{'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch)) - #回传结果 - upload_output() diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md index 0dffdda..90b9ded 100644 --- a/npu_mnist_example/README.md +++ b/npu_mnist_example/README.md @@ -4,49 +4,44 @@ ###### 启智集群的示例代码: -- 训练示例请参考示例中[train_npu.py](./train_npu.py)的代码注释 +- 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释 +- 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释 +- 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释 - 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像,具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释 -- 继续训练功能示例代码请参考[train_continue.py](./train_continue.py) ,启智与智算的用法相同 -## 2. 在openi上获取数据集,预训练模型,输出路径 +## 2. 在云脑上获取数据集,预训练模型,输出路径 -安装openi包 - -``` -pip install -U openi -``` - -使用openi包 +使用c2net包 ``` #导入包 -from openi.context import prepare, upload_openi +from c2net.context import prepare, upload_output #初始化导入数据集和预训练模型到容器内 -openi_context = prepare() +c2net_context = prepare() #获取数据集路径,预训练模型路径,输出路径 -dataset_path = openi_context.dataset_path -pretrain_model_path = openi_context.pretrain_model_path -output_path = openi_context.output_path +dataset_path = c2net_context.dataset_path +pretrain_model_path = c2net_context.pretrain_model_path +output_path = c2net_context.output_path #回传结果到openi -upload_openi() +upload_output() ``` ## 3.[创建NPU训练示例任务界面教程](./Example_Picture/快速创建NPU训练任务.md) ## 4.FAQ -### 4.1 关于启智平台公共库[openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi): +### 4.1 关于公共库[c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi): 主要使用的方法有以下几个: ``` -prepare 准备数据集,模型,输出路径 -upload_openi 将训练镜像的输出结果拷贝回启智平台 -obs_copy_file 通过mox拷贝文件 -obs_copy_folder 通过mox拷贝文件夹 +prepare 准备数据集,模型,输出路径 +c2net.context.upload_output 将训练镜像的输出结果拷贝回启智平台 +c2net.context.moxing_helper.obs_copy_file 通过mox拷贝文件 +c2net.context.moxing_helper.obs_copy_folder 通过mox拷贝文件夹 ``` ### 4.2 解决参数报错问题: @@ -59,7 +54,7 @@ obs_copy_folder
通过mox拷贝文件夹 ``` if local_rank%8==0: - 这里省略下载数据的代码...(openi.context.prepare()) + 这里省略下载数据的代码... f = open("/cache/download_input.txt", 'w') f.close() try: diff --git a/npu_mnist_example/inference.py b/npu_mnist_example/inference.py new file mode 100644 index 0000000..96aff2f --- /dev/null +++ b/npu_mnist_example/inference.py @@ -0,0 +1,90 @@ + + +""" +示例选用的数据集是MNISTData.zip +数据集结构是: + MNISTData.zip + ├── test + │ ├── t10k-images-idx3-ubyte + │ └── t10k-labels-idx1-ubyte + └── train + ├── train-images-idx3-ubyte + └── train-labels-idx1-ubyte + +使用注意事项: +1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 +2、用户需要调用c2net的python sdk包 +""" +import time +import os +import argparse +from config import mnist_cfg as cfg +from dataset import create_dataset +from lenet import LeNet5 +import mindspore.nn as nn +import numpy as np +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore import Tensor +#导入c2net包 +from c2net.context import prepare, upload_output + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 + args, unknown = parser.parse_known_args() + #初始化导入数据集和预训练模型到容器内 + c2net_context = prepare() + #获取数据集路径 + mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" + #获取预训练模型路径 + mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" + #获取输出路径 + save_path = 
c2net_context.output_path + + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + repeat_size = cfg.epoch_size + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + #model = Model(network, net_loss, net_opt, metrics={"Accuracy"}) + model = Model(network, net_loss, net_opt) + + + print("============== Starting Testing ==============") + load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) + ds_test = create_dataset(os.path.join(mnistdata_path, "test"), batch_size=1).create_dict_iterator() + data = next(ds_test) + images = data["image"].asnumpy() + labels = data["label"].asnumpy() + print('Tensor:', Tensor(data['image'])) + output = model.predict(Tensor(data['image'])) + predicted = np.argmax(output.asnumpy(), axis=1) + pred = np.argmax(output.asnumpy(), axis=1) + print('predicted:', predicted) + print('pred:', pred) + + print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') + filename = 'result.txt' + file_path = os.path.join(save_path, filename) + with open(file_path, 'a+') as file: + file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) + + ###上传训练结果到启智平台,注意必须将要输出的模型存储在c2net_context.output_path + upload_output() \ No newline at end of file diff --git a/npu_mnist_example/read_imagenet.py b/npu_mnist_example/read_imagenet.py index b54f926..eba4f24 100644 --- a/npu_mnist_example/read_imagenet.py +++ b/npu_mnist_example/read_imagenet.py @@ -36,7 +36,7 @@ import moxing as mox import mindspore as ms from mindspore.dataset import ImageFolderDataset import mindspore.dataset.vision.c_transforms as transforms -from openi.context import upload_openi +from c2net.context import upload_output parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example') parser.add_argument('--train_url', @@ -45,6 +45,7 @@ 
parser.add_argument('--train_url', if __name__ == "__main__": args, unknown = parser.parse_known_args() + #注意只有训练任务可用 data_path = '/cache/sfs/data/imagenet/' modelart_output = '/cache/output' if not os.path.exists(modelart_output): @@ -69,4 +70,4 @@ if __name__ == "__main__": data_info = dataset_train.to_json(filename= modelart_output + '/data_info.json') print(data_info) - upload_openi() \ No newline at end of file + upload_output() \ No newline at end of file diff --git a/npu_mnist_example/train_npu.py b/npu_mnist_example/train.py similarity index 94% rename from npu_mnist_example/train_npu.py rename to npu_mnist_example/train.py index 5546bc3..1316dc9 100644 --- a/npu_mnist_example/train_npu.py +++ b/npu_mnist_example/train.py @@ -15,7 +15,9 @@ 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 """ + import os +os.system("pip install c2net-beta -i https://pypi.tuna.tsinghua.edu.cn/simple") import argparse from config import mnist_cfg as cfg from dataset import create_dataset @@ -29,6 +31,7 @@ import time #导入c2net包 from c2net.context import prepare, upload_output + parser = argparse.ArgumentParser(description='MindSpore Lenet Example') parser.add_argument( @@ -52,6 +55,8 @@ if __name__ == "__main__": mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" #获取预训练模型路径 mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" + #获取输出路径 + output_path = c2net_context.output_path context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) #使用数据集的方式 @@ -77,7 +82,7 @@ if __name__ == "__main__": save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) #将模型保存到c2net_context.output_path - outputDirectory = c2net_context.output_path + "/" + outputDirectory = output_path + "/" ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=outputDirectory, config=config_ck) diff --git 
a/npu_mnist_example/train_continue.py b/npu_mnist_example/train_continue.py deleted file mode 100644 index 47b2dc2..0000000 --- a/npu_mnist_example/train_continue.py +++ /dev/null @@ -1,122 +0,0 @@ -##################################################################################################### -# 继续训练功能:修改训练任务时,若勾选复用上次结果,则可在新训练任务的输出路径中读取到上次结果 -# -# 示例用法 -# - 增加两个训练参数 -# 'ckpt_save_name' 此次任务的输出文件名,用于保存此次训练的模型文件名称(不带后缀) -# 'ckpt_load_name' 上一次任务的输出文件名,用于加载上一次输出的模型文件名称(不带后缀),首次训练默认为空,则不读取任何文件 -# - 训练代码中判断 'ckpt_load_name' 是否为空,若不为空,则为继续训练任务 -##################################################################################################### - - -import os -import argparse -from config import mnist_cfg as cfg -from dataset import create_dataset -from dataset_distributed import create_dataset_parallel -from lenet import LeNet5 -import mindspore.nn as nn -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor -from mindspore import load_checkpoint, load_param_into_net -from mindspore.train import Model -from mindspore.nn.metrics import Accuracy -from mindspore.communication.management import get_rank - -#导入openi包 -from openi.context import prepare, upload_openi -from openi.context.helper import obs_copy_file, obs_copy_folder - -parser = argparse.ArgumentParser(description='MindSpore Lenet Example') - -parser.add_argument( - '--device_target', - type=str, - default="Ascend", - choices=['Ascend', 'CPU'], - help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') - -parser.add_argument('--epoch_size', - type=int, - default=5, - help='Training epochs.') - -### continue task parameters -parser.add_argument('--ckpt_load_name', - help='model name to save/load', - default= '') - -parser.add_argument('--ckpt_save_name', - help='model name to save/load', - default= 'checkpoint') - - -if __name__ == "__main__": - args, unknown = parser.parse_known_args() - - 
###Initialize and copy data to training image - openi_context = prepare() - data_dir = openi_context.dataset_path - pretrain_model_dir = openi_context.pretrain_model_path - train_dir = openi_context.output_path - - device_num = int(os.getenv('RANK_SIZE')) - ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) - if ds_train.get_dataset_size() == 0: - raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") - - network = LeNet5(cfg.num_classes) - net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") - net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) - time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) - - ### 继续训练模型加载 - if args.ckpt_load_name: - obs_copy_folder(args.train_url, base_path) - load_path = "{}/{}.ckpt".format(base_path,args.ckpt_load_name) - param_dict = load_checkpoint(load_path) - load_param_into_net(network, param_dict) - print("Successfully load ckpt file:{}, saved_net_work:{}".format(load_path,param_dict)) - ### 保存已有模型名避免重复回传结果 - outputFiles = os.listdir(base_path) - - if args.device_target != "Ascend": - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}) - else: - model = Model(network, - net_loss, - net_opt, - metrics={"accuracy": Accuracy()}, - amp_level="O2") - - config_ck = CheckpointConfig( - save_checkpoint_steps=cfg.save_checkpoint_steps, - keep_checkpoint_max=cfg.keep_checkpoint_max) - #Note that this method saves the model file on each card. You need to specify the save path on each card. - # In this example, get_rank() is added to distinguish different paths. 
- if device_num == 1: - save_path = base_path + "/" - if device_num > 1: - save_path = base_path + "/" + str(get_rank()) + "/" - ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name, - directory=save_path, - config=config_ck) - print("============== Starting Training ==============") - epoch_size = cfg['epoch_size'] - if (args.epoch_size): - epoch_size = args.epoch_size - print('epoch_size is: ', epoch_size) - model.train(epoch_size, - ds_train, - callbacks=[time_cb, ckpoint_cb, - LossMonitor()]) - - ### 将训练容器中的新输出模型 回传到启智社区 - outputFilesNew = os.listdir(base_path) - new_models = [i for i in outputFilesNew if i not in outputFiles] - for n in new_models: - ckpt_url = base_path + "/" + n - obs_ckpt_url = args.train_url + "/" + n - obs_copy_file(ckpt_url, obs_ckpt_url) \ No newline at end of file diff --git a/npu_mnist_example/train_npu_multi_card.py b/npu_mnist_example/train_multi_card.py similarity index 96% rename from npu_mnist_example/train_npu_multi_card.py rename to npu_mnist_example/train_multi_card.py index 3b6df62..1b345fb 100644 --- a/npu_mnist_example/train_npu_multi_card.py +++ b/npu_mnist_example/train_multi_card.py @@ -30,7 +30,7 @@ from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank import time #导入openi包 -from c2net.context import prepare, upload_output +from c2net.context import prepare parser = argparse.ArgumentParser(description='MindSpore Lenet Example') @@ -67,6 +67,7 @@ if __name__ == "__main__": mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" #获取预训练模型路径 mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" + output_path = c2net_context.output_path #Set a cache file to determine whether the data has been copied to obs. #If this file exists during multi-card training, there is no need to copy the dataset multiple times. 
f = open("/cache/download_input.txt", 'w') @@ -102,7 +103,7 @@ if __name__ == "__main__": keep_checkpoint_max=cfg.keep_checkpoint_max) #Note that this method saves the model file on each card. You need to specify the save path on each card. # In this example, get_rank() is added to distinguish different paths. - outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/" + outputDirectory = output_path + "/" + str(get_rank()) + "/" ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=outputDirectory, config=config_ck) diff --git a/train.py b/train.py deleted file mode 100644 index d114c93..0000000 --- a/train.py +++ /dev/null @@ -1,15 +0,0 @@ - -#导入包 -import os -from c2net.context import prepare, upload_output - -#初始化导入数据集和预训练模型到容器内 -c2net_context = prepare() - -#获取数据集路径,预训练模型路径,输出路径 -dataset_path = c2net_context.dataset_path -pretrain_model_path = c2net_context.pretrain_model_path -output_path = c2net_context.output_path - -#回传结果到openi -upload_output() \ No newline at end of file