|
- '''
- 1,本示例中多数据集训练上传的数据集结构
- MnistDataset_torch.zip
- ├── test
- └── train
-
- checkpoint_epoch1_0.73.zip
- ├── mnist_epoch1_0.73.pkl
-
- 2,由于a100的适配性问题,使用训练环境前请使用平台的含有cuda11以上的推荐镜像在调试环境中调试自己的代码,
- 本示例的镜像地址是dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191,并
- 提交镜像,再切到训练环境训练已跑通的代码。
- 在训练环境中,上传的数据集会自动放在/dataset目录下,注意:选择单数据集和多数据集时的路径不同!
- (1)如果是单数据集:如选择的是MnistDataset_torch.zip,则数据集目录为/dataset/train、/dataset/test;
- 本示例中单数据集在训练镜像中的数据集结构
- dataset
- ├── test
- └── train
- (2)如选择的是多数据集,如选择的是MnistDataset_torch.zip和checkpoint_epoch1_0.73.zip,则数据集
- 目录为/dataset/MnistDataset_torch/train、/dataset/MnistDataset_torch/test
- 和/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
- 本示例中多数据集在训练镜像中的数据集结构
- dataset
- ├── MnistDataset_torch
- | ├── test
- | └── train
- └── checkpoint_epoch1_0.73
- ├── mnist_epoch1_0.73.pkl
-
-
- 模型下载路径默认在/model下,请将模型输出位置指定到/model,启智平台界面会提供/model目录下的文件下载。
-
-
-
- '''
-
-
- from model import Model
- import numpy as np
- import torch
- from torchvision.datasets import mnist
- from torch.nn import CrossEntropyLoss
- from torch.optim import SGD
- from torch.utils.data import DataLoader
- from torchvision.transforms import ToTensor
- import argparse
-
# Training settings
def _positive_int(text):
    """Parse *text* as an integer and reject anything below 1.

    Intended as an argparse ``type=`` callable, so that an unusable value is
    reported as a normal command-line error at parse time.

    Raises:
        argparse.ArgumentTypeError: if *text* is not an integer or is < 1.
    """
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            'invalid int value: {!r}'.format(text)) from None
    if value < 1:
        raise argparse.ArgumentTypeError(
            'must be a positive integer, got {}'.format(value))
    return value


parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# Datasets are located under /dataset.
parser.add_argument(
    '--traindata',
    default="/dataset/MnistDataset_torch/train",
    help='path to train dataset',
)
parser.add_argument(
    '--testdata',
    default="/dataset/MnistDataset_torch/test",
    help='path to test dataset',
)
parser.add_argument(
    '--checkpoint',
    default="/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl",
    help='checkpoint file',
)
parser.add_argument(
    '--epoch_size',
    type=int,
    default=1,
    help='how much epoch to train',
)
# The batch size is handed straight to DataLoader, so validate it up front
# instead of letting a zero/negative value fail later during loader setup.
parser.add_argument(
    '--batch_size',
    type=_positive_int,
    default=256,
    help='how much batch_size in epoch',
)
-
def train_one_epoch(model, loader, optimizer, criterion, device, log_every=10):
    """Run one optimisation pass of *model* over every batch in *loader*.

    Args:
        model: module to train; switched to training mode here.
        loader: iterable yielding ``(inputs, labels)`` batches.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: callable ``criterion(predictions, labels)`` returning the loss.
        device: device the batches are moved to before the forward pass.
        log_every: print the loss every this many batches.
    """
    model.train()
    for idx, (train_x, train_label) in enumerate(loader):
        train_x = train_x.to(device)
        train_label = train_label.to(device)
        optimizer.zero_grad()
        predict_y = model(train_x.float())
        loss = criterion(predict_y, train_label.long())
        if idx % log_every == 0:
            print('idx: {}, loss: {}'.format(idx, loss.item()))
        loss.backward()
        optimizer.step()


def evaluate(model, loader, device):
    """Count how many samples in *loader* the model classifies correctly.

    The predicted class is the index of the largest value along the last
    axis of the model output.

    Args:
        model: module to evaluate; switched to eval mode here.
        loader: iterable yielding ``(inputs, labels)`` batches.
        device: device the inputs are moved to before the forward pass.

    Returns:
        ``(correct, total)`` as plain ints; ``(0, 0)`` for an empty loader.
    """
    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for scoring, so skip building autograd graphs.
    with torch.no_grad():
        for test_x, test_label in loader:
            predict_y = model(test_x.to(device).float())
            # Compare on the CPU, where the loader's label tensors are used.
            predicted = predict_y.argmax(dim=-1).cpu()
            correct += (predicted == test_label.cpu()).sum().item()
            total += test_label.shape[0]
    return correct, total


if __name__ == '__main__':
    args = parser.parse_args()
    # Log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epoch = args.epoch_size
    # Log output
    print('epoch_size is:{}'.format(epoch))
    # To resume from an already trained model:
    # path = args.checkpoint
    # checkpoint = torch.load(path, map_location=device)
    # model.load_state_dict(checkpoint)
    # Start training; the model is scored and saved after every epoch.
    for _epoch in range(epoch):
        print('the {} epoch_size begin'.format(_epoch + 1))
        train_one_epoch(model, train_loader, sgd, cost, device)

        correct, total = evaluate(model, test_loader, device)
        # Guard against an empty test set instead of dividing by zero.
        accuracy = correct / total if total else 0.0
        # Log output
        print('accuracy: {:.2f}'.format(accuracy))
        # Model output goes under /model.
        # NOTE(review): this pickles the whole module, while the commented-out
        # resume snippet above calls load_state_dict on the loaded object;
        # confirm which format downstream consumers expect before changing it.
        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy))
|