|
- #!/usr/bin/python
- #coding=utf-8
- '''
- If there are Chinese comments in the code,please add at the beginning:
- #!/usr/bin/python
- #coding=utf-8
-
- 示例选用的数据集是MnistDataset_torch.zip
- 数据集结构是:
- MnistDataset_torch.zip
- ├── test
- │ ├── MNIST/processed/test.pt
- │ └── MNIST/processed/training.pt
- │ ├── MNIST/raw/train-images-idx3-ubyte
- │ └── MNIST/raw/train-labels-idx1-ubyte
- │ ├── MNIST/raw/t10k-images-idx3-ubyte
- │ └── MNIST/raw/t10k-labels-idx1-ubyte
- ├── train
- │ ├── MNIST/processed/test.pt
- │ └── MNIST/processed/training.pt
- │ ├── MNIST/raw/train-images-idx3-ubyte
- │ └── MNIST/raw/train-labels-idx1-ubyte
- │ ├── MNIST/raw/t10k-images-idx3-ubyte
- │ └── MNIST/raw/t10k-labels-idx1-ubyte
-
-
-
- 示例选用的预训练模型文件为:mnist_epoch1_0.86.pkl
-
- '''
-
import os

# Install the OpenI SDK from the path injected by the platform environment.
# This MUST run before `from openi.context import ...` below, which is why an
# os.system call sits in the middle of the import section.
print("begin:")
os.system("pip install {}".format(os.getenv("OPENI_SDK_PATH")))
import torch
from model import Model
import numpy as np
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
from openi.context import prepare, upload_openi

import importlib.util
-
def is_torch_dtu_available():
    """Return True when the full torch_dtu stack is importable.

    Probes the package, its ``core`` subpackage and the ``dtu_model`` module;
    only when all three resolve is an Enflame DTU runtime considered present.
    """
    required = ("torch_dtu", "torch_dtu.core", "torch_dtu.core.dtu_model")
    # The generator keeps the original short-circuit: submodules are only
    # probed after the parent package has been found.
    return all(importlib.util.find_spec(name) is not None for name in required)
-
# Training settings: hyper-parameters the launcher may override on the CLI.
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
for _flag, _default, _help in (
        ('--epoch_size', 1, 'how much epoch to train'),
        ('--batch_size', 256, 'how much batch_size in epoch'),
):
    parser.add_argument(_flag, type=int, default=_default, help=_help)
-
-
if __name__ == '__main__':
    # Parse known args and ignore extra hyper-parameters injected by the
    # platform launcher (parse_known_args instead of parse_args).
    args, unknown = parser.parse_known_args()

    # Stage the dataset and the pretrained model into the container.
    openi_context = prepare()

    # Resolve the dataset / pretrained-model / output directories provided
    # by the OpenI context.
    dataset_path = openi_context.dataset_path
    pretrain_model_path = openi_context.pretrain_model_path
    output_path = openi_context.output_path

    dataset_path_A = dataset_path + "/MnistDataset"
    pretrain_model_path_A = pretrain_model_path + "/MNIST_PytorchExample_GPU_test34_model_7f9j"

    print("dataset_path:")
    print(os.listdir(dataset_path))

    print("pretrain_model_path:")
    print(os.listdir(pretrain_model_path))

    print("output_path:")
    print(os.listdir(output_path))

    # Select the compute device: Enflame DTU when the torch_dtu stack is
    # importable, otherwise CUDA if available, else CPU.
    if is_torch_dtu_available():
        import torch_dtu
        import torch_dtu.distributed as dist
        import torch_dtu.core.dtu_model as dm
        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
        print('dtu is available: True')
        device = dm.dtu_device()
        DTU_FLAG = True
    else:
        print('dtu is available: False')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        DTU_FLAG = False

    # Build the model and ONE optimizer. The original script created a second
    # Model/SGD pair ("sgd") and stepped that one, so the optimizer state
    # restored from the checkpoint was never used and the state saved back
    # out was stale. A single pair fixes resume-training.
    model = Model().to(device)
    optimizer = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()

    batch_size = args.batch_size
    # Local dataset only (download=False): layout is described in the module
    # docstring (train/ and test/ each holding an MNIST directory).
    train_dataset = mnist.MNIST(root=dataset_path_A + "/train", train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=dataset_path_A + "/test", train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    epochs = args.epoch_size
    print('epoch_size is:{}'.format(epochs))

    # Resume from the pretrained checkpoint when present: restores the model
    # weights, the optimizer that is actually stepped below, and the epoch
    # counter so training continues where it left off.
    checkpoint_file = pretrain_model_path_A + "/mnist_epoch1_0.70.pkl"
    if os.path.exists(checkpoint_file):
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('加载 epoch {} 权重成功!'.format(start_epoch))
    else:
        start_epoch = 0
        print('无保存模型,将从头开始训练!')

    for _epoch in range(start_epoch, epochs):
        print('the {} epoch_size begin'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            optimizer.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            if DTU_FLAG:
                # The DTU backend needs its own step helper to flush the
                # lazily-built graph; barrier=True syncs before continuing.
                dm.optimizer_step(optimizer, barrier=True)
            else:
                optimizer.step()

        # Evaluate on the held-out split after each epoch.
        correct = 0
        _sum = 0
        model.eval()
        with torch.no_grad():
            for test_x, test_label in test_loader:
                predict_y = model(test_x.to(device).float())
                predict_ys = np.argmax(predict_y.cpu().numpy(), axis=-1)
                matches = predict_ys == test_label.numpy()
                correct += np.sum(matches, axis=-1)
                _sum += matches.shape[0]
        print('accuracy: {:.2f}'.format(correct / _sum))

        # Save the checkpoint into the platform output directory so it is
        # uploaded back (the original hard-coded /tmp/output, which disagreed
        # with the output_path obtained from the OpenI context above).
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': _epoch + 1}
        torch.save(state, '{}/mnist_epoch{}_{:.2f}.pkl'.format(output_path, _epoch + 1, correct / _sum))
        print('test:')
        print(os.listdir(output_path))
|