
update

liuzx-patch-1
liuzx 1 year ago
parent commit 8863478571
18 changed files with 393 additions and 467 deletions
  1. +0   -227  OpenI云脑使用教程.ipynb
  2. +3   -3    README.md
  3. +0   -51   gcu_mnist_example/README.md
  4. +1   -1    gcu_mnist_example/model.py
  5. +1   -1    gcu_mnist_example/train.py
  6. +53  -0    gpgpu_mnist_example/README.md
  7. +82  -0    gpgpu_mnist_example/inference.py
  8. +35  -0    gpgpu_mnist_example/model.py
  9. +15  -14   gpu_mnist_example/README.md
  10. +82  -0   gpu_mnist_example/inference.py
  11. +2   -6   gpu_mnist_example/train.py
  12. +17  -22  npu_mnist_example/README.md
  13. +90  -0   npu_mnist_example/inference.py
  14. +3   -2   npu_mnist_example/read_imagenet.py
  15. +6   -1   npu_mnist_example/train.py
  16. +0   -122 npu_mnist_example/train_continue.py
  17. +3   -2   npu_mnist_example/train_multi_card.py
  18. +0   -15  train.py

+ 0  - 227  OpenI云脑使用教程.ipynb

@@ -1,227 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7d112f9b-84ba-420d-a52b-9eb7ba307068",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Looking in indexes: http://pip.modelarts.private.com:8888/repository/pypi/simple\n",
"Requirement already satisfied: openi-test==0.7.1 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (0.7.1)\n",
"Requirement already satisfied: requests in /home/ma-user/modelarts-dev/modelarts-sdk (from openi-test==0.7.1) (2.28.2)\n",
"Requirement already satisfied: tqdm in /home/ma-user/modelarts-dev/modelarts-sdk (from openi-test==0.7.1) (4.64.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/ma-user/modelarts-dev/modelarts-sdk (from requests->openi-test==0.7.1) (3.3.2)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (from requests->openi-test==0.7.1) (1.26.12)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/ma-user/modelarts-dev/modelarts-sdk (from requests->openi-test==0.7.1) (3.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/ma-user/anaconda3/envs/python-3.7.10/lib/python3.7/site-packages (from requests->openi-test==0.7.1) (2022.6.15)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install openi-test==0.7.1"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "02ad2e02-6533-4da0-98c3-c5f238d4d8f7",
"metadata": {},
"outputs": [],
"source": [
"#导入包\n",
"from openi.context import prepare, upload_openi"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "69880626-9320-46cd-ad29-8e5f7be09f32",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:root:Using MoXing-v2.1.0.5d9c87c8-5d9c87c8\n",
"INFO:root:Using OBS-Python-SDK-3.20.9.1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"🎉 Successfully Download s3:///urchincache/attachment/d/d/ddabdf57-a65a-496c-bef0-19d82b9043cd/MNISTData.zip to /home/ma-user/work/dataset/MNISTData.zip\n",
"🎉 Successfully Extracted /home/ma-user/work/dataset/MNISTData.zip\n",
"🎉 Successfully Deleted /home/ma-user/work/dataset/MNISTData.zip\n",
"🎉 Successfully Download s3:///urchincache/attachment/2/c/2c59be66-64ec-41ca-b311-f51a486eabf8/checkpoint_lenet-1_1875.zip to /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n",
"🎉 Successfully Extracted /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n",
"🎉 Successfully Deleted /home/ma-user/work/dataset/checkpoint_lenet-1_1875.zip\n",
"🎉 Successfully Download s3:///urchincache/aimodels/0/c/0cf4367b-5234-4967-a41f-f548d3f69fcf/ to /home/ma-user/work/pretrainmodel/MNIST_Example_model_zjdt\n",
"please set the output location to /home/ma-user/work/output\n"
]
}
],
"source": [
"\n",
"#初始化导入数据集和预训练模型到容器内\n",
"openi_context = prepare()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c586f98f-bead-4dc9-a22f-173a672d456b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ma-user/work/dataset\n"
]
},
{
"data": {
"text/plain": [
"['checkpoint_lenet-1_1875', 'MNISTData']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#获取数据集路径,预训练模型路径,输出路径\n",
"dataset_path = openi_context.dataset_path\n",
"print(dataset_path)\n",
"\n",
"import os\n",
"os.listdir(dataset_path)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "7d6617f0-7b86-4b1b-a201-ecdc58db53a5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ma-user/work/pretrainmodel\n"
]
},
{
"data": {
"text/plain": [
"['MNIST_Example_model_zjdt']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pretrain_model_path = openi_context.pretrain_model_path\n",
"print(pretrain_model_path)\n",
"os.listdir(pretrain_model_path)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "6bc51211-5555-452e-9d83-adcfee1c4f79",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ma-user/work/output\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_path = openi_context.output_path\n",
"print(output_path)\n",
"os.listdir(output_path)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "48b5da5d-a55f-4781-9056-b886d41779c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"upload /home/ma-user/work/output to openi\n"
]
},
{
"data": {
"text/plain": [
"'/home/ma-user/work/output'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#回传结果到openi,训练任务才能回传,调试任务回传后也是不支持下载\n",
"upload_openi()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75e7ce04-594e-4e8f-8292-15241709eb5e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "python-3.7.10",
"language": "python",
"name": "python-3.7.10"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

+ 3  - 3  README.md

@@ -12,7 +12,7 @@ pip install -U c2net-beta

```
#Import the package
from c2net.context import prepare, upload_output
from c2net.context import prepare

#Initialize: import the dataset and pretrained model into the container
c2net_context = prepare()
@@ -22,12 +22,12 @@ dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
output_path = c2net_context.output_path

#Upload results back to openi; only training tasks can upload, and uploads from debug tasks cannot be downloaded
upload_output()
#You must save outputs to c2net_context.output_path for them to be uploaded back to openi; only training tasks can upload, and uploads from debug tasks cannot be downloaded yet
```
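
As a minimal sketch of the output rule above (assuming a PyTorch workflow like the examples below; the tiny nn.Linear model and the file name are placeholders, not part of the c2net API):

```
import os
import torch
from torch import nn
from c2net.context import prepare

#prepare() mounts the dataset, pretrained model, and output paths into the container
c2net_context = prepare()

#Placeholder network; the real examples use a LeNet-style Model class
model = nn.Linear(10, 2)

#Only files written under c2net_context.output_path are uploaded back to openi
save_path = os.path.join(c2net_context.output_path, "model_final.pkl")
torch.save({"model": model.state_dict()}, save_path)
print(os.listdir(c2net_context.output_path))
```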

## 2. Handwritten digit recognition examples

* For the GPU example, see [gpu_mnist_example](./gpu_mnist_example/README.md)
* For the GPGPU example, see [gpgpu_mnist_example](./gpgpu_mnist_example/README.md)
* For the NPU example, see [npu_mnist_example](./npu_mnist_example/README.md)
* For the GCU example, see [gcu_mnist_example](./gcu_mnist_example/README.md)

+ 0  - 51  gcu_mnist_example/README.md

@@ -1,53 +1,3 @@
<<<<<<< HEAD
# How to train a model on the OpenI platform — GCU example

## 1. GCU training samples for the OpenI cluster and the intelligent computing cluster

###### Example code for the OpenI cluster:

- For the training example, see the code comments in [train_gcu.py](./train_gcu.py)

## 2. Getting the dataset, pretrained model, and output paths on openi

Install the openi package

```
pip install -U openi
```

Use the openi package

```
#Import the package
from openi.context import prepare, upload_openi

#Initialize: import the dataset and pretrained model into the container
openi_context = prepare()

#Get the dataset path, pretrained model path, and output path
dataset_path = openi_context.dataset_path
pretrain_model_path = openi_context.pretrain_model_path
output_path = openi_context.output_path

#Upload results back to openi
upload_openi()
```

## 3. FAQ

### 3.1 About the OpenI platform public library [openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi):

The main methods used are:

```
prepare         Prepare the dataset, model, and output paths
upload_openi    Copy the training image's outputs back to the OpenI platform
```

### 3.2 Resolving argument errors:

Add `args, unknown = parser.parse_known_args()` to your code to ignore errors from arguments such as `--ckpt_url` and `--multi_date_url`
=======
# How to train a model on the OpenI platform - GCU version

- Single-dataset training on the OpenI cluster, multi-dataset training on the OpenI cluster, and single-dataset training on the intelligent computing cluster are used in different ways; please distinguish between them. The data loading and model definition logic are roughly the same as in the [Handwritten Digit Recognition GPU version_PytorchExample](https://openi.pcl.ac.cn/OpenIOSSG/MNIST_PytorchExample_GPU) project:
@@ -166,6 +116,5 @@ upload_openi    Copy the training image's outputs back to the OpenI platform
Currently the training task logs come from print statements in the code; see the related print calls in the example train_for_c2net.py

### 4.2 The model files can be downloaded after training finishes
>>>>>>> origin/liuzx

## If you have any questions about the example code, feel free to open an issue in this project.

+ 1  - 1  gcu_mnist_example/model.py

@@ -32,4 +32,4 @@ class Model(Module):
y = self.relu4(y)
y = self.fc3(y)
y = self.relu5(y)
return y
return y

gcu_mnist_example/train_gcu.py → gcu_mnist_example/train.py

@@ -140,4 +140,4 @@ if __name__ == '__main__':
#The model output location is placed under /tmp/output
state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':_epoch+1}
torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(c2net_context.output_path, _epoch+1, correct / _sum))
print(os.listdir('{}'.format(c2net_context.output_path)))
print(os.listdir('{}'.format(c2net_context.output_path)))

+ 53  - 0  gpgpu_mnist_example/README.md

@@ -0,0 +1,53 @@
# How to train a model on the OpenI platform — GPGPU example

## 1. GPGPU training samples for the OpenI cluster and the intelligent computing cluster

###### Example code for the OpenI cluster:

- For the inference task example, see the code comments in [inference.py](./inference.py)

## 2. Getting the dataset, pretrained model, and output paths on Cloudbrain

Install the c2net package

```
pip install -U c2net-beta
```

Use the c2net package

```
#Import the package
from c2net.context import prepare,upload_output

#Initialize: import the dataset and pretrained model into the container
c2net_context = prepare()

#Get the dataset path, pretrained model path, and output path
dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
output_path = c2net_context.output_path

#Upload results
upload_output()

```

## 3. [UI walkthrough for creating a GPU training example task](./Example_Picture/快速创建GPU训练任务.md)

## 4. FAQ

### 4.1 About the OpenI platform public library [c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi):

The main methods used are:

```
prepare          Prepare the dataset, model, and output paths
upload_output    Copy the training image's outputs back to the OpenI platform
```

### 4.2 Resolving argument errors:

Add `args, unknown = parser.parse_known_args()` to your code to ignore errors from arguments such as `--ckpt_url`, `--data_url`, and `--multi_date_url`; a minimal sketch follows.
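
A self-contained sketch of this trick (the flag values below are illustrative only):

```
import argparse

parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')

#parse_known_args() returns (known_args, leftover_list) instead of raising an error,
#so platform-injected flags such as --ckpt_url or --data_url are simply ignored
args, unknown = parser.parse_known_args(['--epoch_size', '3', '--ckpt_url', 's3://placeholder'])
print(args.epoch_size)   #3
print(unknown)           #['--ckpt_url', 's3://placeholder']
```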

## If you have any questions about the example code, feel free to open an issue in this project.

+ 82  - 0  gpgpu_mnist_example/inference.py

@@ -0,0 +1,82 @@
#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8

1. The dataset structure of the single dataset in this example
MnistDataset_torch.zip
├── test
└── train

'''
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os
#Import the c2net package
from c2net.context import prepare

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')

# Parameter declarations
WORKERS = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
# Model evaluation
def test(model, test_loader, data_length):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for i, data in enumerate(test_loader, 0):
x, y = data
x = x.to(device)
y = y.to(device)
y_hat = model(x)
test_loss += cost(y_hat, y).item()
pred = y_hat.max(1, keepdim=True)[1]
correct += pred.eq(y.view_as(pred)).sum().item()
test_loss /= (i+1)

# Write results to the output folder
filename = 'result.txt'
file_path = os.path.join('/tmp/output', filename)
with open(file_path, 'w') as file:
file.write('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, data_length, 100. * correct / data_length))


if __name__ == '__main__':
args, unknown = parser.parse_known_args()
#Initialize: import the dataset and pretrained model into the container
c2net_context = prepare()
#Get the dataset paths
checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875"
MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
#Get the pretrained model path
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
#log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
epochs = args.epoch_size
test_dataset = mnist.MNIST(root=mnist_example_test2_model_djts_path + "/test", train=False, transform=ToTensor(),download=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
checkpoint = torch.load(mnist_example_test2_model_djts_path + "/mnist_epoch1_0.73.pkl")
model.load_state_dict(checkpoint['model'])
test(model,test_loader,len(test_dataset))

+ 35  - 0  gpgpu_mnist_example/model.py

@@ -0,0 +1,35 @@
from torch.nn import Module
from torch import nn


class Model(Module):
def __init__(self):
super(Model, self).__init__()
self.conv1 = nn.Conv2d(1, 6, 5)
self.relu1 = nn.ReLU()
self.pool1 = nn.MaxPool2d(2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.relu2 = nn.ReLU()
self.pool2 = nn.MaxPool2d(2)
self.fc1 = nn.Linear(256, 120)
self.relu3 = nn.ReLU()
self.fc2 = nn.Linear(120, 84)
self.relu4 = nn.ReLU()
self.fc3 = nn.Linear(84, 10)
self.relu5 = nn.ReLU()

def forward(self, x):
y = self.conv1(x)
y = self.relu1(y)
y = self.pool1(y)
y = self.conv2(y)
y = self.relu2(y)
y = self.pool2(y)
y = y.view(y.shape[0], -1)
y = self.fc1(y)
y = self.relu3(y)
y = self.fc2(y)
y = self.relu4(y)
y = self.fc3(y)
y = self.relu5(y)
return y

+ 15  - 14  gpu_mnist_example/README.md

@@ -4,46 +4,47 @@

###### Example code for the OpenI cluster:

- For the training example, see the code comments in [train_gpu.py](./train_gpu.py)
- For the training task example, see the code comments in [train.py](./train.py)
- For the inference task example, see the code comments in [inference.py](./inference.py)

## 2. Getting the dataset, pretrained model, and output paths on openi
## 2. Getting the dataset, pretrained model, and output paths on Cloudbrain

Install openi
Install c2net

```
pip install -U openi
pip install -U c2net-beta
```

Use openi
Use c2net

```
#Import the package
from openi.context import prepare, upload_openi
from c2net.context import prepare,upload_output

#Initialize: import the dataset and pretrained model into the container
openi_context = prepare()
c2net_context = prepare()

#Get the dataset path, pretrained model path, and output path
dataset_path = openi_context.dataset_path
pretrain_model_path = openi_context.pretrain_model_path
dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
output_path = openi_context.output_path

#Upload results back to openi
upload_openi()
#Upload results
upload_output()

```

## 3. [UI walkthrough for creating a GPU training example task](./Example_Picture/快速创建GPU训练任务.md)

## 4. FAQ

### 4.1 About the OpenI platform public library [openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi):
### 4.1 About the OpenI platform public library [c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi):

The main methods used are:

```
prepare          Prepare the dataset, model, and output paths
upload_openi     Copy the training image's outputs back to the OpenI platform

upload_output    Copy the training image's outputs back to the OpenI platform
```

### 4.2 Resolving argument errors:


+ 82  - 0  gpu_mnist_example/inference.py

@@ -0,0 +1,82 @@
#!/usr/bin/python
#coding=utf-8
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8

1. The dataset structure of the single dataset in this example
MnistDataset_torch.zip
├── test
└── train

'''
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os
#Import the c2net package
from c2net.context import prepare

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')

# Parameter declarations
WORKERS = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()
# Model evaluation
def test(model, test_loader, data_length):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for i, data in enumerate(test_loader, 0):
x, y = data
x = x.to(device)
y = y.to(device)
y_hat = model(x)
test_loss += cost(y_hat, y).item()
pred = y_hat.max(1, keepdim=True)[1]
correct += pred.eq(y.view_as(pred)).sum().item()
test_loss /= (i+1)

# Write results to the output folder
filename = 'result.txt'
file_path = os.path.join('/tmp/output', filename)
with open(file_path, 'w') as file:
file.write('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, data_length, 100. * correct / data_length))


if __name__ == '__main__':
args, unknown = parser.parse_known_args()
#Initialize: import the dataset and pretrained model into the container
c2net_context = prepare()
#Get the dataset paths
checkpoint_lenet_1_1875_path = c2net_context.dataset_path+"/"+"checkpoint_lenet-1_1875"
MnistDataset_torch = c2net_context.dataset_path+"/"+"MnistDataset_torch"
#Get the pretrained model path
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
#log output
print('cuda is available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch_size = args.batch_size
epochs = args.epoch_size
test_dataset = mnist.MNIST(root=mnist_example_test2_model_djts_path + "/test", train=False, transform=ToTensor(),download=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
model = Model().to(device)
checkpoint = torch.load(mnist_example_test2_model_djts_path + "/mnist_epoch1_0.73.pkl")
model.load_state_dict(checkpoint['model'])
test(model,test_loader,len(test_dataset))

gpu_mnist_example/train_gpu.py → gpu_mnist_example/train.py

@@ -11,9 +11,7 @@ If there are Chinese comments in the code,please add at the beginning:
└── train

'''
import os
os.system("pip install openi-test")
os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))


from model import Model
import numpy as np
@@ -26,7 +24,7 @@ from torchvision.transforms import ToTensor
import argparse
import os
#Import the c2net package
from c2net.context import prepare, upload_output
from c2net.context import prepare

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
@@ -114,7 +112,5 @@ if __name__ == '__main__':
# Save the model to c2net_context.output_path
state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch))
#Upload results
upload_output()



+ 17  - 22  npu_mnist_example/README.md

@@ -4,49 +4,44 @@

###### Example code for the OpenI cluster:

- For the training example, see the code comments in [train_npu.py](./train_npu.py)
- For the single-card training task example, see the code comments in [train.py](./train.py)
- For the multi-card training task example, see the code comments in [train_multi_card.py](./train_multi_card.py)
- For the inference task example, see the code comments in [inference.py](./inference.py)
- OpenI cluster training tasks already mount the ImageNet-1k dataset into the training image; for usage details, see the code comments in [read_imagenet.py](./read_imagenet.py)
- For the resume-training feature example, see [train_continue.py](./train_continue.py); usage is the same on the OpenI and intelligent computing clusters

## 2. Getting the dataset, pretrained model, and output paths on openi
## 2. Getting the dataset, pretrained model, and output paths on Cloudbrain

Install the openi package

```
pip install -U openi
```

Use the openi package
Use the c2net package

```
#Import the package
from openi.context import prepare, upload_openi
from c2net.context import prepare, upload_openi

#Initialize: import the dataset and pretrained model into the container
openi_context = prepare()
c2net_context = prepare()

#Get the dataset path, pretrained model path, and output path
dataset_path = openi_context.dataset_path
pretrain_model_path = openi_context.pretrain_model_path
output_path = openi_context.output_path
dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
output_path = c2net_context.output_path

#Upload results back to openi
upload_openi()
upload_output()
```

## 3. [UI walkthrough for creating an NPU training example task](./Example_Picture/快速创建NPU训练任务.md)

## 4. FAQ

### 4.1 About the OpenI platform public library [openi](https://openi.pcl.ac.cn/OpenIOSSG/openi-pypi):
### 4.1 About the public library [c2net](https://openi.pcl.ac.cn/OpenIOSSG/c2net-pypi):

The main methods used are:

```
prepare            Prepare the dataset, model, and output paths
upload_openi       Copy the training image's outputs back to the OpenI platform
obs_copy_file      Copy a file via mox (moxing)
obs_copy_folder    Copy a folder via mox (moxing)
prepare            Prepare the dataset, model, and output paths
c2net.context.upload_output                    Copy the training image's outputs back to the OpenI platform
c2net.context.moxing_helper.obs_copy_file      Copy a file via mox (moxing)
c2net.context.moxing_helper.obs_copy_folder    Copy a folder via mox (moxing)
```
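
A hedged sketch of combining these helpers (the moxing_helper import path follows the list above and may differ by c2net version; the bucket path and file name are placeholders):

```
from c2net.context import prepare, upload_output
from c2net.context.moxing_helper import obs_copy_file

c2net_context = prepare()

#Copy one extra file from OBS into the container via mox (placeholder source path)
obs_copy_file("s3://my-bucket/extra/checkpoint_lenet-1_1875.ckpt",
              c2net_context.dataset_path + "/checkpoint_lenet-1_1875.ckpt")

#Copy everything under c2net_context.output_path back to the OpenI platform
upload_output()
```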

### 4.2 Resolving argument errors:
@@ -59,7 +54,7 @@ obs_copy_folder    Copy a folder via mox (moxing)

```
if local_rank%8==0:
The data download code is omitted here...(openi.context.prepare())
The data download code is omitted here...
f = open("/cache/download_input.txt", 'w')
f.close()
try:


+ 90  - 0  npu_mnist_example/inference.py

@@ -0,0 +1,90 @@


"""
The dataset used in this example is MNISTData.zip
The dataset structure is:
MNISTData.zip
├── test
│ ├── t10k-images-idx3-ubyte
│ └── t10k-labels-idx1-ubyte
└── train
├── train-images-idx3-ubyte
└── train-labels-idx1-ubyte
Usage notes:
1. Add args, unknown = parser.parse_known_args() to the code to ignore argument errors such as --ckpt_url
2. The user needs to call the c2net python sdk package
"""
import time
import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from lenet import LeNet5
import mindspore.nn as nn
import numpy as np
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore import Tensor
#Import the c2net package
from c2net.context import prepare, upload_output


parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')

parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

if __name__ == "__main__":
###Add args, unknown = parser.parse_known_args() to the code to ignore argument errors such as --ckpt_url
args, unknown = parser.parse_known_args()
#Initialize: import the dataset and pretrained model into the container
c2net_context = prepare()
#Get the dataset path
mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
#Get the pretrained model path
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
#Get the output path
save_path = c2net_context.output_path
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
repeat_size = cfg.epoch_size
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
#model = Model(network, net_loss, net_opt, metrics={"Accuracy"})
model = Model(network, net_loss, net_opt)


print("============== Starting Testing ==============")
load_param_into_net(network, load_checkpoint(os.path.join(mnist_example_test2_model_djts_path, "checkpoint_lenet-1_1875.ckpt")))
ds_test = create_dataset(os.path.join(mnistdata_path, "test"), batch_size=1).create_dict_iterator()
data = next(ds_test)
images = data["image"].asnumpy()
labels = data["label"].asnumpy()
print('Tensor:', Tensor(data['image']))
output = model.predict(Tensor(data['image']))
predicted = np.argmax(output.asnumpy(), axis=1)
pred = np.argmax(output.asnumpy(), axis=1)
print('predicted:', predicted)
print('pred:', pred)

print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
filename = 'result.txt'
file_path = os.path.join(save_path, filename)
with open(file_path, 'a+') as file:
file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))

###Upload training results to the OpenI platform; note that output models must be stored in c2net_context.output_path
upload_output()

+ 3  - 2  npu_mnist_example/read_imagenet.py

@@ -36,7 +36,7 @@ import moxing as mox
import mindspore as ms
from mindspore.dataset import ImageFolderDataset
import mindspore.dataset.vision.c_transforms as transforms
from openi.context import upload_openi
from c2net.context import upload_output

parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example')
parser.add_argument('--train_url',
@@ -45,6 +45,7 @@ parser.add_argument('--train_url',

if __name__ == "__main__":
args, unknown = parser.parse_known_args()
#Note: only available in training tasks
data_path = '/cache/sfs/data/imagenet/'
modelart_output = '/cache/output'
if not os.path.exists(modelart_output):
@@ -69,4 +70,4 @@ if __name__ == "__main__":

data_info = dataset_train.to_json(filename= modelart_output + '/data_info.json')
print(data_info)
upload_openi()
upload_output()

npu_mnist_example/train_npu.py → npu_mnist_example/train.py

@@ -15,7 +15,9 @@
1. Add args, unknown = parser.parse_known_args() to the code to ignore argument errors such as --ckpt_url
2. The user needs to call the c2net python sdk package
"""

import os
os.system("pip install c2net-beta -i https://pypi.tuna.tsinghua.edu.cn/simple")
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
@@ -29,6 +31,7 @@ import time
#Import the c2net package
from c2net.context import prepare, upload_output


parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument(
@@ -52,6 +55,8 @@ if __name__ == "__main__":
mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
#Get the pretrained model path
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
#Get the output path
output_path = c2net_context.output_path
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
#How the dataset is used
@@ -77,7 +82,7 @@ if __name__ == "__main__":
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Save the model to c2net_context.output_path
outputDirectory = c2net_context.output_path + "/"
outputDirectory = output_path + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)

+ 0  - 122  npu_mnist_example/train_continue.py

@@ -1,122 +0,0 @@
#####################################################################################################
# Resume-training feature: when modifying a training task, if "reuse previous results" is checked,
# the previous results can be read from the new training task's output path
#
# Example usage
# - Add two training parameters:
#   'ckpt_save_name'  output file name of this task, used to save this run's model file (without suffix)
#   'ckpt_load_name'  output file name of the previous task, used to load the previously saved model file (without suffix);
#                     empty by default for a first run, in which case no file is read
# - In the training code, check whether 'ckpt_load_name' is empty; if it is not empty, this is a resume-training task
#####################################################################################################


import os
import argparse
from config import mnist_cfg as cfg
from dataset import create_dataset
from dataset_distributed import create_dataset_parallel
from lenet import LeNet5
import mindspore.nn as nn
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import load_checkpoint, load_param_into_net
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore.communication.management import get_rank

#Import the openi package
from openi.context import prepare, upload_openi
from openi.context.helper import obs_copy_file, obs_copy_folder

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')

parser.add_argument(
'--device_target',
type=str,
default="Ascend",
choices=['Ascend', 'CPU'],
help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')

parser.add_argument('--epoch_size',
type=int,
default=5,
help='Training epochs.')

### continue task parameters
parser.add_argument('--ckpt_load_name',
help='model name to save/load',
default= '')

parser.add_argument('--ckpt_save_name',
help='model name to save/load',
default= 'checkpoint')


if __name__ == "__main__":
args, unknown = parser.parse_known_args()

###Initialize and copy data to training image
openi_context = prepare()
data_dir = openi_context.dataset_path
pretrain_model_dir = openi_context.pretrain_model_path
train_dir = openi_context.output_path

device_num = int(os.getenv('RANK_SIZE'))
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)
if ds_train.get_dataset_size() == 0:
raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
### Load the model for resume training
if args.ckpt_load_name:
obs_copy_folder(args.train_url, base_path)
load_path = "{}/{}.ckpt".format(base_path,args.ckpt_load_name)
param_dict = load_checkpoint(load_path)
load_param_into_net(network, param_dict)
print("Successfully load ckpt file:{}, saved_net_work:{}".format(load_path,param_dict))
### Record existing model file names to avoid re-uploading results
outputFiles = os.listdir(base_path)

if args.device_target != "Ascend":
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()})
else:
model = Model(network,
net_loss,
net_opt,
metrics={"accuracy": Accuracy()},
amp_level="O2")

config_ck = CheckpointConfig(
save_checkpoint_steps=cfg.save_checkpoint_steps,
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
if device_num == 1:
save_path = base_path + "/"
if device_num > 1:
save_path = base_path + "/" + str(get_rank()) + "/"
ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name,
directory=save_path,
config=config_ck)
print("============== Starting Training ==============")
epoch_size = cfg['epoch_size']
if (args.epoch_size):
epoch_size = args.epoch_size
print('epoch_size is: ', epoch_size)
model.train(epoch_size,
ds_train,
callbacks=[time_cb, ckpoint_cb,
LossMonitor()])
### Upload the newly produced models in the training container back to the OpenI community
outputFilesNew = os.listdir(base_path)
new_models = [i for i in outputFilesNew if i not in outputFiles]
for n in new_models:
ckpt_url = base_path + "/" + n
obs_ckpt_url = args.train_url + "/" + n
obs_copy_file(ckpt_url, obs_ckpt_url)

npu_mnist_example/train_npu_multi_card.py → npu_mnist_example/train_multi_card.py

@@ -30,7 +30,7 @@ from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank
import time
#Import the openi package
from c2net.context import prepare, upload_output
from c2net.context import prepare


parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
@@ -67,6 +67,7 @@ if __name__ == "__main__":
mnistdata_path = c2net_context.dataset_path+"/"+"MNISTData"
#获取预训练模型路径
mnist_example_test2_model_djts_path = c2net_context.pretrain_model_path+"/"+"MNIST_Example_test2_model_djts"
output_path = c2net_context.output_path
#Set a cache file to determine whether the data has been copied to obs.
#If this file exists during multi-card training, there is no need to copy the dataset multiple times.
f = open("/cache/download_input.txt", 'w')
@@ -102,7 +103,7 @@ if __name__ == "__main__":
keep_checkpoint_max=cfg.keep_checkpoint_max)
#Note that this method saves the model file on each card. You need to specify the save path on each card.
# In this example, get_rank() is added to distinguish different paths.
outputDirectory = c2net_context.output_path + "/" + str(get_rank()) + "/"
outputDirectory = output_path + "/" + str(get_rank()) + "/"
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
directory=outputDirectory,
config=config_ck)

+ 0  - 15  train.py

@@ -1,15 +0,0 @@

#Import the package
import os
from c2net.context import prepare, upload_output

#Initialize: import the dataset and pretrained model into the container
c2net_context = prepare()

#Get the dataset path, pretrained model path, and output path
dataset_path = c2net_context.dataset_path
pretrain_model_path = c2net_context.pretrain_model_path
output_path = c2net_context.output_path

#Upload results back to openi
upload_output()
