| @@ -1,227 +0,0 @@ | |||
| { | |||
| "cells": [ | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 1, | |||
| "id": "7d112f9b-84ba-420d-a52b-9eb7ba307068", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "Looking in indexes: http://pip.modelarts.private.com:8888/repository/pypi/simple\n", | |||
| "Requirement already satisfied: openi-test==0.7.1 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (0.7.1)\n", | |||
| "Requirement already satisfied: requests in /home/ma-user/modelarts-dev/modelarts-sdk (from openi-test==0.7.1) (2.28.2)\n", | |||
| "Requirement already satisfied: tqdm in /home/ma-user/modelarts-dev/modelarts-sdk (from openi-test==0.7.1) (4.64.0)\n", | |||
| "Requirement already satisfied: charset-normalizer<4,>=2 in /home/ma-user/modelarts-dev/modelarts-sdk (from requests->openi-test==0.7.1) (3.3.2)\n", | |||
| "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (from requests->openi-test==0.7.1) (1.26.12)\n", | |||
| "Requirement already satisfied: idna<4,>=2.5 in /home/ma-user/modelarts-dev/modelarts-sdk (from requests->openi-test==0.7.1) (3.4)\n", | |||
| "Requirement already satisfied: certifi>=2017.4.17 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (from requests->openi-test==0.7.1) (2022.6.15)\n", | |||
| "Note: you may need to restart the kernel to use updated packages.\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "pip install openi-test==0.7.1" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 2, | |||
| "id": "02ad2e02-6533-4da0-98c3-c5f238d4d8f7", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [ | |||
| "#导入包\n", | |||
| "from openi.context import prepare, upload_openi" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 3, | |||
| "id": "69880626-9320-46cd-ad29-8e5f7be09f32", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stderr", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "INFO:root:Using MoXing-v2.1.0.5d9c87c8-5d9c87c8\n", | |||
| "INFO:root:Using OBS-Python-SDK-3.20.9.1\n" | |||
| ] | |||
| }, | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "🎉 Successfully Download s3:///urchincache/attachment/d/d/ddabdf57-a65a-496c-bef0-19d82b9043cd/MNISTData.zip to /home/ma-user/work/dataset/MNISTData.zip\n", | |||
| "🎉 Successfully Extracted /home/ma-user/work/dataset/MNISTData.zip\n", | |||
| "🎉 Successfully Deleted /home/ma-user/work/dataset/MNISTData.zip\n", | |||
| "🎉 Successfully Download s3:///urchincache/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf8/checkpoint_lenet-1_1875.zip to /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n", | |||
| "🎉 Successfully Extracted /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n", | |||
| "🎉 Successfully Deleted /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n", | |||
| "🎉 Successfully Download s3:///urchincache/aimodels/0/c/0cf4367b-5234-4967-a41f-f548d3f69fcf/ to /home/ma-user/work/pretrainmodel/MNIST_Example_model_zjdt\n", | |||
| "please set the output location to /home/ma-user/work/output\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "\n", | |||
| "#初始化导入数据集和预训练模型到容器内\n", | |||
| "openi_context = prepare()" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 15, | |||
| "id": "c586f98f-bead-4dc9-a22f-173a672d456b", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "/home/ma-user/work/dataset\n" | |||
| ] | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "['checkpoint_lenet-1_1875', 'MNISTData']" | |||
| ] | |||
| }, | |||
| "execution_count": 15, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "#获取数据集路径,预训练模型路径,输出路径\n", | |||
| "dataset_path = openi_context.dataset_path\n", | |||
| "print(dataset_path)\n", | |||
| "\n", | |||
| "import os\n", | |||
| "os.listdir(dataset_path)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 16, | |||
| "id": "7d6617f0-7b86-4b1b-a201-ecdc58db53a5", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "/home/ma-user/work/pretrainmodel\n" | |||
| ] | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "['MNIST_Example_model_zjdt']" | |||
| ] | |||
| }, | |||
| "execution_count": 16, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "pretrain_model_path = openi_context.pretrain_model_path\n", | |||
| "print(pretrain_model_path)\n", | |||
| "os.listdir(pretrain_model_path)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 17, | |||
| "id": "6bc51211-5555-452e-9d83-adcfee1c4f79", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "/home/ma-user/work/output\n" | |||
| ] | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "[]" | |||
| ] | |||
| }, | |||
| "execution_count": 17, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "output_path = openi_context.output_path\n", | |||
| "print(output_path)\n", | |||
| "os.listdir(output_path)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 9, | |||
| "id": "48b5da5d-a55f-4781-9056-b886d41779c7", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "upload /home/ma-user/work/output to openi\n" | |||
| ] | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "'/home/ma-user/work/output'" | |||
| ] | |||
| }, | |||
| "execution_count": 9, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "#回传结果到openi,训练任务才能回传,调试任务回传后也是不支持下载\n", | |||
| "upload_openi()" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "id": "75e7ce04-594e-4e8f-8292-15241709eb5e", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| } | |||
| ], | |||
| "metadata": { | |||
| "kernelspec": { | |||
| "display_name": "python-3.7.10", | |||
| "language": "python", | |||
| "name": "python-3.7.10" | |||
| }, | |||
| "language_info": { | |||
| "codemirror_mode": { | |||
| "name": "ipython", | |||
| "version": 3 | |||
| }, | |||
| "file_extension": ".py", | |||
| "mimetype": "text/x-python", | |||
| "name": "python", | |||
| "nbconvert_exporter": "python", | |||
| "pygments_lexer": "ipython3", | |||
| "version": "3.7.10" | |||
| } | |||
| }, | |||
| "nbformat": 4, | |||
| "nbformat_minor": 5 | |||
| } | |||
| @@ -12,7 +12,7 @@ pip install -U c2net-beta | |||
| ``` | |||
# Import the package
from c2net.context import prepare, upload_output
from c2net.context import prepare
# Initialize: download the datasets and pretrained models into the container
| c2net_context = prepare() | |||
| @@ -22,12 +22,12 @@ dataset_path = c2net_context.dataset_path | |||
| pretrain_model_path = c2net_context.pretrain_model_path | |||
| output_path = c2net_context.output_path | |||
# Upload results back to OpenI; only training tasks can upload, and results uploaded from a debug task cannot be downloaded
upload_output()
# Outputs must be saved to c2net_context.output_path to be uploaded back to OpenI; only training tasks can upload, and results uploaded from a debug task cannot currently be downloaded
| ``` | |||
## 2. MNIST handwritten digit recognition examples
* For the GPU example, see [gpu_mnist_example](./gpu_mnist_example/README.md)
* For the GPGPU example, see [gpgpu_mnist_example](./gpgpu_mnist_example/README.md)
* For the NPU example, see [npu_mnist_example](./npu_mnist_example/README.md)
* For the GCU example, see [gcu_mnist_example](./gcu_mnist_example/README.md)
| @@ -1,53 +1,3 @@ | |||
| <<<<<<< HEAD | |||
# How to train a model on the Qizhi platform - GCU example
## 1. GCU training examples for the Qizhi cluster and the intelligent computing cluster
###### Example code for the Qizhi cluster:
- For the training example, see the code comments in [train_gcu.py](./train_gcu.py)
## 2. Getting the dataset, pretrained model, and output paths on OpenI
Install the openi package
```
pip install -U openi
```
Use the openi package
```
# Import the package
from openi.context import prepare, upload_openi
# Initialize: download the datasets and pretrained models into the container
openi_context = prepare()
# Get the dataset path, pretrained-model path, and output path
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
# Upload results back to OpenI
| upload_openi() | |||
| ``` | |||
## 3. FAQ
### 3.1 About the Qizhi platform's public library [openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi):
The main methods are:
```
prepare        prepare the dataset, model, and output paths
upload_openi   copy the outputs of the training image back to the Qizhi platform
```
### 3.2 Resolving argument-parsing errors:
Add `args, unknown = parser.parse_known_args()` to your code to ignore errors caused by unrecognized arguments such as `--ckpt_url` and `--multi_date_url`
| ======= | |||
# How to train a model on the Qizhi platform - GCU version
- Single-dataset training on the Qizhi cluster, multi-dataset training on the Qizhi cluster, and single-dataset training on the intelligent computing cluster are used in different ways; please keep them apart. The data loading and model definition logic largely follow the [MNIST handwritten digit recognition, GPU version (PytorchExample)](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GPU) project:
| @@ -166,6 +116,5 @@ upload_openi 将训练镜像的输出结果拷贝回启智平台 | |||
Training-task logs are currently produced by print statements in the code; see the relevant print calls in the example train_for_c2net.py
### 4.2 Model files can be downloaded after training finishes
| >>>>>>> origin/liuzx | |||
## If you have any questions about the example code, feel free to open an issue in this project.
| @@ -32,4 +32,4 @@ class Model(Module): | |||
| y = self.relu4(y) | |||
| y = self.fc3(y) | |||
| y = self.relu5(y) | |||
| return y | |||
| return y | |||
| @@ -140,4 +140,4 @@ if __name__ == '__main__': | |||
#The model output location (c2net_context.output_path) is placed under /tmp/output
| state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1} | |||
| torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum)) | |||
| print(os.listdir('{}'.format(c2net_context.output_path))) | |||
| print(os.listdir('{}'.format(c2net_context.output_path))) | |||
| @@ -0,0 +1,53 @@ | |||
# How to train a model on the Qizhi platform - GPGPU example
## 1. GPGPU training examples for the Qizhi cluster and the intelligent computing cluster
###### Example code for the Qizhi cluster:
- For the inference-task example, see the code comments in [inference.py](./inference.py)
## 2. Getting the dataset, pretrained model, and output paths on Cloud Brain
Install the c2net package
```
pip install -U c2net-beta
```
Use the c2net package
```
# Import the package
from c2net.context import prepare, upload_output
# Initialize: download the datasets and pretrained models into the container
c2net_context = prepare()
# Get the dataset path, pretrained-model path, and output path
| dataset_path = c2net_context.dataset_path | |||
| pretrain_model_path = c2net_context.pretrain_model_path | |||
output_path = c2net_context.output_path
# Upload the results
| upload_output() | |||
| ``` | |||
## 3. [Step-by-step UI guide to creating a GPU training task](./Example_Picture/快速创建GPU训练任务.md)
## 4. FAQ
### 4.1 About the Qizhi platform's public library [c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi):
The main methods are:
```
prepare        prepare the dataset, model, and output paths
upload_output  copy the outputs of the training image back to the Qizhi platform
```
### 4.2 Resolving argument-parsing errors:
Add `args, unknown = parser.parse_known_args()` to your code to ignore errors caused by unrecognized arguments such as `--ckpt_url`, `--data_url`, and `--multi_date_url`
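A minimal, self-contained sketch of that pattern; the extra flags below merely stand in for whatever the platform injects:
```
import argparse

parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--epoch_size', type=int, default=10)

# parse_known_args() keeps the arguments it recognizes and collects everything
# else (e.g. a platform-injected --ckpt_url) into `unknown` instead of raising
# an "unrecognized arguments" error.
args, unknown = parser.parse_known_args(['--epoch_size', '3', '--ckpt_url', 's3://bucket/ckpt'])
print(args.epoch_size)  # 3
print(unknown)          # ['--ckpt_url', 's3://bucket/ckpt']
```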
## If you have any questions about the example code, feel free to open an issue in this project.
| @@ -0,0 +1,82 @@ | |||
| #!/usr/bin/python | |||
| #coding=utf-8 | |||
| ''' | |||
If there are Chinese comments in the code, please add the following at the beginning:
#!/usr/bin/python
#coding=utf-8
1. The dataset structure of the single dataset in this example
| MnistDataset_torch.zip | |||
| ├── test | |||
| └── train | |||
| ''' | |||
| from model import Model | |||
| import numpy as np | |||
| import torch | |||
| from torchvision.datasets import mnist | |||
| from torch.nn import CrossEntropyLoss | |||
| from torch.optim import SGD | |||
| from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| import os | |||
# Import the c2net package
| from c2net.context import prepare | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size to use in each epoch')
# Parameter declarations
| WORKERS = 0 | |||
| device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||
| model = Model().to(device) | |||
| optimizer = SGD(model.parameters(), lr=1e-1) | |||
| cost = CrossEntropyLoss() | |||
# Model evaluation
| def test(model, test_loader, data_length): | |||
| model.eval() | |||
| test_loss = 0 | |||
| correct = 0 | |||
| with torch.no_grad(): | |||
| for i, data in enumerate(test_loader, 0): | |||
| x, y = data | |||
| x = x.to(device) | |||
| y = y.to(device) | |||
| y_hat = model(x) | |||
| test_loss += cost(y_hat, y).item() | |||
| pred = y_hat.max(1, keepdim=True)[1] | |||
| correct += pred.eq(y.view_as(pred)).sum().item() | |||
| test_loss /= (i+1) | |||
# Write the results to the output folder
| filename = 'result.txt' | |||
| file_path = os.path.join('/tmp/output', filename) | |||
| with open(file_path, 'w') as file: | |||
| file.write('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||
| test_loss, correct, data_length, 100. * correct / data_length)) | |||
| if __name__ == '__main__': | |||
| args, unknown = parser.parse_known_args() | |||
# Initialize: download the datasets and pretrained models into the container
c2net_context = prepare()
# Get the dataset paths
checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875"
MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
# Get the pretrained-model path
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| #log output | |||
| print('cuda is available:{}'.format(torch.cuda.is_available())) | |||
| device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||
| batch_size = args.batch_size | |||
| epochs = args.epoch_size | |||
| test_dataset = mnist.MNIST(root=mnist_example_test2_model_djts_path + "/test", train=False, transform=ToTensor(),download=False) | |||
| test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||
| model = Model().to(device) | |||
| checkpoint = torch.load(mnist_example_test2_model_djts_path + "/mnist_epoch1_0.73.pkl") | |||
| model.load_state_dict(checkpoint['model']) | |||
| test(model,test_loader,len(test_dataset)) | |||
| @@ -0,0 +1,35 @@ | |||
| from torch.nn import Module | |||
| from torch import nn | |||
| class Model(Module): | |||
| def __init__(self): | |||
| super(Model, self).__init__() | |||
| self.conv1 = nn.Conv2d(1, 6, 5) | |||
| self.relu1 = nn.ReLU() | |||
| self.pool1 = nn.MaxPool2d(2) | |||
| self.conv2 = nn.Conv2d(6, 16, 5) | |||
| self.relu2 = nn.ReLU() | |||
| self.pool2 = nn.MaxPool2d(2) | |||
| self.fc1 = nn.Linear(256, 120) | |||
| self.relu3 = nn.ReLU() | |||
| self.fc2 = nn.Linear(120, 84) | |||
| self.relu4 = nn.ReLU() | |||
| self.fc3 = nn.Linear(84, 10) | |||
| self.relu5 = nn.ReLU() | |||
| def forward(self, x): | |||
| y = self.conv1(x) | |||
| y = self.relu1(y) | |||
| y = self.pool1(y) | |||
| y = self.conv2(y) | |||
| y = self.relu2(y) | |||
| y = self.pool2(y) | |||
| y = y.view(y.shape[0], -1) | |||
| y = self.fc1(y) | |||
| y = self.relu3(y) | |||
| y = self.fc2(y) | |||
| y = self.relu4(y) | |||
| y = self.fc3(y) | |||
| y = self.relu5(y) | |||
| return y | |||
| @@ -4,46 +4,47 @@ | |||
###### Example code for the Qizhi cluster:
- For the training example, see the code comments in [train_gpu.py](./train_gpu.py)
- For the training-task example, see the code comments in [train.py](./train.py)
- For the inference-task example, see the code comments in [inference.py](./inference.py)
## 2. Getting the dataset, pretrained model, and output paths on OpenI
## 2. Getting the dataset, pretrained model, and output paths on Cloud Brain
Install the openi package
Install the c2net package
| ``` | |||
| pip install -U openi | |||
| pip install -U c2net-beta | |||
| ``` | |||
Use the openi package
Use the c2net package
```
# Import the package
from openi.context import prepare, upload_openi
from c2net.context import prepare, upload_output
# Initialize: download the datasets and pretrained models into the container
openi_context = prepare()
c2net_context = prepare()
# Get the dataset path, pretrained-model path, and output path
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| dataset_path = c2net_context.dataset_path | |||
| pretrain_model_path = c2net_context.pretrain_model_path | |||
output_path = c2net_context.output_path
# Upload results back to OpenI
upload_openi()
# Upload the results
| upload_output() | |||
| ``` | |||
## 3. [Step-by-step UI guide to creating a GPU training task](./Example_Picture/快速创建GPU训练任务.md)
## 4. FAQ
### 4.1 About the Qizhi platform's public library [openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi):
### 4.1 About the Qizhi platform's public library [c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi):
The main methods are:
```
prepare        prepare the dataset, model, and output paths
upload_openi   copy the outputs of the training image back to the Qizhi platform
upload_output  copy the outputs of the training image back to the Qizhi platform
| ``` | |||
### 4.2 Resolving argument-parsing errors:
| @@ -0,0 +1,82 @@ | |||
| #!/usr/bin/python | |||
| #coding=utf-8 | |||
| ''' | |||
If there are Chinese comments in the code, please add the following at the beginning:
#!/usr/bin/python
#coding=utf-8
1. The dataset structure of the single dataset in this example
| MnistDataset_torch.zip | |||
| ├── test | |||
| └── train | |||
| ''' | |||
| from model import Model | |||
| import numpy as np | |||
| import torch | |||
| from torchvision.datasets import mnist | |||
| from torch.nn import CrossEntropyLoss | |||
| from torch.optim import SGD | |||
| from torch.utils.data import DataLoader | |||
| from torchvision.transforms import ToTensor | |||
| import argparse | |||
| import os | |||
# Import the c2net package
| from c2net.context import prepare | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size to use in each epoch')
# Parameter declarations
| WORKERS = 0 | |||
| device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||
| model = Model().to(device) | |||
| optimizer = SGD(model.parameters(), lr=1e-1) | |||
| cost = CrossEntropyLoss() | |||
# Model evaluation
| def test(model, test_loader, data_length): | |||
| model.eval() | |||
| test_loss = 0 | |||
| correct = 0 | |||
| with torch.no_grad(): | |||
| for i, data in enumerate(test_loader, 0): | |||
| x, y = data | |||
| x = x.to(device) | |||
| y = y.to(device) | |||
| y_hat = model(x) | |||
| test_loss += cost(y_hat, y).item() | |||
| pred = y_hat.max(1, keepdim=True)[1] | |||
| correct += pred.eq(y.view_as(pred)).sum().item() | |||
| test_loss /= (i+1) | |||
# Write the results to the output folder
| filename = 'result.txt' | |||
| file_path = os.path.join('/tmp/output', filename) | |||
| with open(file_path, 'w') as file: | |||
| file.write('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||
| test_loss, correct, data_length, 100. * correct / data_length)) | |||
| if __name__ == '__main__': | |||
| args, unknown = parser.parse_known_args() | |||
# Initialize: download the datasets and pretrained models into the container
c2net_context = prepare()
# Get the dataset paths
checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875"
MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
# Get the pretrained-model path
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| #log output | |||
| print('cuda is available:{}'.format(torch.cuda.is_available())) | |||
| device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||
| batch_size = args.batch_size | |||
| epochs = args.epoch_size | |||
| test_dataset = mnist.MNIST(root=mnist_example_test2_model_djts_path + "/test", train=False, transform=ToTensor(),download=False) | |||
| test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||
| model = Model().to(device) | |||
| checkpoint = torch.load(mnist_example_test2_model_djts_path + "/mnist_epoch1_0.73.pkl") | |||
| model.load_state_dict(checkpoint['model']) | |||
| test(model,test_loader,len(test_dataset)) | |||
| @@ -11,9 +11,7 @@ If there are Chinese comments in the code,please add at the beginning: | |||
| └── train | |||
| ''' | |||
| import os | |||
| os.system("pip install openi-test") | |||
| os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH"))) | |||
| from model import Model | |||
| import numpy as np | |||
| @@ -26,7 +24,7 @@ from torchvision.transforms import ToTensor | |||
| import argparse | |||
| import os | |||
# Import the c2net package
| from c2net.context import prepare, upload_output | |||
| from c2net.context import prepare | |||
| # Training settings | |||
| parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||
| @@ -114,7 +112,5 @@ if __name__ == '__main__': | |||
# Save the model to c2net_context.output_path
| state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} | |||
| torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch)) | |||
# Upload the results
| upload_output() | |||
| @@ -4,49 +4,44 @@ | |||
###### Example code for the Qizhi cluster:
- For the training example, see the code comments in [train_npu.py](./train_npu.py)
- For the single-card training-task example, see the code comments in [train.py](./train.py)
- For the multi-card training-task example, see the code comments in [train_multi_card.py](./train_multi_card.py)
- For the inference-task example, see the code comments in [inference.py](./inference.py)
- Training tasks on the Qizhi cluster already mount the ImageNet-1k dataset into the training image; for usage details, see the code comments in [read_imagenet.py](./read_imagenet.py)
- For the continue-training feature, see the example code in [train_continue.py](./train_continue.py); usage is the same on the Qizhi and intelligent computing clusters
## 2. Getting the dataset, pretrained model, and output paths on OpenI
## 2. Getting the dataset, pretrained model, and output paths on Cloud Brain
Install the openi package
| ``` | |||
| pip install -U openi | |||
| ``` | |||
Use the openi package
Use the c2net package
```
# Import the package
| from openi.context import prepare, upload_openi | |||
from c2net.context import prepare, upload_output
# Initialize: download the datasets and pretrained models into the container
openi_context = prepare()
c2net_context = prepare()
# Get the dataset path, pretrained-model path, and output path
| dataset_path = openi_context.dataset_path | |||
| pretrain_model_path = openi_context.pretrain_model_path | |||
| output_path = openi_context.output_path | |||
| dataset_path = c2net_context.dataset_path | |||
| pretrain_model_path = c2net_context.pretrain_model_path | |||
| output_path = c2net_context.output_path | |||
# Upload results back to OpenI
| upload_openi() | |||
| upload_output() | |||
| ``` | |||
## 3. [Step-by-step UI guide to creating an NPU training task](./Example_Picture/快速创建NPU训练任务.md)
## 4. FAQ
### 4.1 About the Qizhi platform's public library [openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi):
### 4.1 About the public library [c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi):
The main methods are:
```
prepare           prepare the dataset, model, and output paths
upload_openi      copy the outputs of the training image back to the Qizhi platform
obs_copy_file     copy a file via mox
obs_copy_folder   copy a folder via mox
prepare                                          prepare the dataset, model, and output paths
c2net.context.upload_output                      copy the outputs of the training image back to the Qizhi platform
c2net.context.moxing_helper.obs_copy_file        copy a file via mox
c2net.context.moxing_helper.obs_copy_folder      copy a folder via mox
| ``` | |||
### 4.2 Resolving argument-parsing errors:
| @@ -59,7 +54,7 @@ obs_copy_folder 通过mox拷贝文件夹 | |||
| ``` | |||
| if local_rank%8==0: | |||
The data-download code is omitted here... (openi.context.prepare())
The data-download code is omitted here...
| f = open("/cache/download_input.txt", 'w') | |||
| f.close() | |||
| try: | |||
| @@ -0,0 +1,90 @@ | |||
| """ | |||
The dataset used in this example is MNISTData.zip
The dataset structure is:
MNISTData.zip
├── test
│   ├── t10k-images-idx3-ubyte
│   └── t10k-labels-idx1-ubyte
└── train
    ├── train-images-idx3-ubyte
    └── train-labels-idx1-ubyte
Usage notes:
1. Add args, unknown = parser.parse_known_args() to your code to ignore errors caused by arguments such as --ckpt_url
2. You need to call the c2net Python SDK package
| """ | |||
| import time | |||
| import os | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| import numpy as np | |||
| from mindspore import context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore import Tensor | |||
# Import the c2net package
| from c2net.context import prepare, upload_output | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| if __name__ == "__main__": | |||
### Add args, unknown = parser.parse_known_args() to your code to ignore errors caused by arguments such as --ckpt_url
args, unknown = parser.parse_known_args()
# Initialize: download the datasets and pretrained models into the container
| c2net_context = prepare() | |||
# Get the dataset path
mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
# Get the pretrained-model path
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
# Get the output path
| save_path = c2net_context.output_path | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| repeat_size = cfg.epoch_size | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| #model = Model(network, net_loss, net_opt, metrics={"Accuracy"}) | |||
| model = Model(network, net_loss, net_opt) | |||
| print("============== Starting Testing ==============") | |||
| load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt"))) | |||
| ds_test = create_dataset(os.path.join(mnistdata_path, "test"), batch_size=1).create_dict_iterator() | |||
| data = next(ds_test) | |||
| images = data["image"].asnumpy() | |||
| labels = data["label"].asnumpy() | |||
| print('Tensor:', Tensor(data['image'])) | |||
| output = model.predict(Tensor(data['image'])) | |||
| predicted = np.argmax(output.asnumpy(), axis=1) | |||
| pred = np.argmax(output.asnumpy(), axis=1) | |||
| print('predicted:', predicted) | |||
| print('pred:', pred) | |||
| print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') | |||
| filename = 'result.txt' | |||
| file_path = os.path.join(save_path, filename) | |||
| with open(file_path, 'a+') as file: | |||
| file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) | |||
### Upload the training results to the Qizhi platform; note that output models must be stored under c2net_context.output_path
| upload_output() | |||
| @@ -36,7 +36,7 @@ import moxing as mox | |||
| import mindspore as ms | |||
| from mindspore.dataset import ImageFolderDataset | |||
| import mindspore.dataset.vision.c_transforms as transforms | |||
| from openi.context import upload_openi | |||
| from c2net.context import upload_output | |||
| parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example') | |||
| parser.add_argument('--train_url', | |||
| @@ -45,6 +45,7 @@ parser.add_argument('--train_url', | |||
| if __name__ == "__main__": | |||
| args, unknown = parser.parse_known_args() | |||
# Note: available only in training tasks
| data_path = '/cache/sfs/data/imagenet/' | |||
| modelart_output = '/cache/output' | |||
| if not os.path.exists(modelart_output): | |||
| @@ -69,4 +70,4 @@ if __name__ == "__main__": | |||
| data_info = dataset_train.to_json(filename= modelart_output + '/data_info.json') | |||
| print(data_info) | |||
| upload_openi() | |||
| upload_output() | |||
| @@ -15,7 +15,9 @@ | |||
1. Add args, unknown = parser.parse_known_args() to your code to ignore errors caused by arguments such as --ckpt_url
2. You need to call the c2net Python SDK package
| """ | |||
| import os | |||
| os.system("pip install c2net-beta -i https://pypi.tuna.tsinghua.edu.cn/simple") | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| @@ -29,6 +31,7 @@ import time | |||
# Import the c2net package
| from c2net.context import prepare, upload_output | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument( | |||
| @@ -52,6 +55,8 @@ if __name__ == "__main__": | |||
| mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" | |||
# Get the pretrained-model path
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
# Get the output path
| output_path = c2net_context.output_path | |||
| context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||
# How the dataset is used
| @@ -77,7 +82,7 @@ if __name__ == "__main__": | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
# Save the model to c2net_context.output_path
| outputDirectory = c2net_context.output_path + "/" | |||
| outputDirectory = output_path + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=outputDirectory, | |||
| config=config_ck) | |||
| @@ -1,122 +0,0 @@ | |||
| ##################################################################################################### | |||
# Continue-training feature: when modifying a training task, if "reuse previous results" is checked,
# the previous task's results can be read from the new task's output path
#
# Example usage
# - Add two training parameters:
#   'ckpt_save_name'  output filename of this task, used to name the model file saved by this run (without extension)
#   'ckpt_load_name'  output filename of the previous task, used to load the previously saved model file (without extension);
#                     defaults to empty on a first run, in which case no file is read
# - In the training code, check whether 'ckpt_load_name' is empty; if it is not, this is a continue-training task
| ##################################################################################################### | |||
| import os | |||
| import argparse | |||
| from config import mnist_cfg as cfg | |||
| from dataset import create_dataset | |||
| from dataset_distributed import create_dataset_parallel | |||
| from lenet import LeNet5 | |||
| import mindspore.nn as nn | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
| from mindspore import load_checkpoint, load_param_into_net | |||
| from mindspore.train import Model | |||
| from mindspore.nn.metrics import Accuracy | |||
| from mindspore.communication.management import get_rank | |||
# Import the openi package
| from openi.context import prepare, upload_openi | |||
| from openi.context.helper import obs_copy_file, obs_copy_folder | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| parser.add_argument( | |||
| '--device_target', | |||
| type=str, | |||
| default="Ascend", | |||
| choices=['Ascend', 'CPU'], | |||
help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
| parser.add_argument('--epoch_size', | |||
| type=int, | |||
| default=5, | |||
| help='Training epochs.') | |||
| ### continue task parameters | |||
| parser.add_argument('--ckpt_load_name', | |||
| help='model name to save/load', | |||
| default= '') | |||
| parser.add_argument('--ckpt_save_name', | |||
| help='model name to save/load', | |||
| default= 'checkpoint') | |||
| if __name__ == "__main__": | |||
| args, unknown = parser.parse_known_args() | |||
| ###Initialize and copy data to training image | |||
| openi_context = prepare() | |||
| data_dir = openi_context.dataset_path | |||
| pretrain_model_dir = openi_context.pretrain_model_path | |||
| train_dir = openi_context.output_path | |||
| device_num = int(os.getenv('RANK_SIZE')) | |||
| ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||
| if ds_train.get_dataset_size() == 0: | |||
| raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||
| network = LeNet5(cfg.num_classes) | |||
| net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||
| net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||
| time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||
### Load the model when continuing training
| if args.ckpt_load_name: | |||
| obs_copy_folder(args.train_url, base_path) | |||
| load_path = "{}/{}.ckpt".format(base_path,args.ckpt_load_name) | |||
| param_dict = load_checkpoint(load_path) | |||
| load_param_into_net(network, param_dict) | |||
| print("Successfully load ckpt file:{}, saved_net_work:{}".format(load_path,param_dict)) | |||
### Record existing model filenames to avoid uploading duplicate results
| outputFiles = os.listdir(base_path) | |||
| if args.device_target != "Ascend": | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}) | |||
| else: | |||
| model = Model(network, | |||
| net_loss, | |||
| net_opt, | |||
| metrics={"accuracy": Accuracy()}, | |||
| amp_level="O2") | |||
| config_ck = CheckpointConfig( | |||
| save_checkpoint_steps=cfg.save_checkpoint_steps, | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||
| # In this example, get_rank() is added to distinguish different paths. | |||
| if device_num == 1: | |||
| save_path = base_path + "/" | |||
| if device_num > 1: | |||
| save_path = base_path + "/" + str(get_rank()) + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name, | |||
| directory=save_path, | |||
| config=config_ck) | |||
| print("============== Starting Training ==============") | |||
| epoch_size = cfg['epoch_size'] | |||
| if (args.epoch_size): | |||
| epoch_size = args.epoch_size | |||
| print('epoch_size is: ', epoch_size) | |||
| model.train(epoch_size, | |||
| ds_train, | |||
| callbacks=[time_cb, ckpoint_cb, | |||
| LossMonitor()]) | |||
### Upload newly produced models from the training container back to the OpenI community
| outputFilesNew = os.listdir(base_path) | |||
| new_models = [i for i in outputFilesNew if i not in outputFiles] | |||
| for n in new_models: | |||
| ckpt_url = base_path + "/" + n | |||
| obs_ckpt_url = args.train_url + "/" + n | |||
| obs_copy_file(ckpt_url, obs_ckpt_url) | |||
| @@ -30,7 +30,7 @@ from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import init, get_rank | |||
| import time | |||
# Import the c2net package
| from c2net.context import prepare, upload_output | |||
| from c2net.context import prepare | |||
| parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||
| @@ -67,6 +67,7 @@ if __name__ == "__main__": | |||
| mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData" | |||
| #获取预训练模型路径 | |||
| mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts" | |||
| output_path = c2net_context.output_path | |||
| #Set a cache file to determine whether the data has been copied to obs. | |||
| #If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||
| f = open("/cache/download_input.txt", 'w') | |||
| @@ -102,7 +103,7 @@ if __name__ == "__main__": | |||
| keep_checkpoint_max=cfg.keep_checkpoint_max) | |||
| #Note that this method saves the model file on each card. You need to specify the save path on each card. | |||
| # In this example, get_rank() is added to distinguish different paths. | |||
| outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/" | |||
| outputDirectory = output_path + "/" + str(get_rank()) + "/" | |||
| ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||
| directory=outputDirectory, | |||
| config=config_ck) | |||
| @@ -1,15 +0,0 @@ | |||
# Import the package
import os
from c2net.context import prepare, upload_output
# Initialize: download the datasets and pretrained models into the container
c2net_context = prepare()
# Get the dataset path, pretrained-model path, and output path
dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
output_path = c2net_context.output_path
# Upload results back to OpenI
| upload_output() | |||