From 66fa99a88a0ab9994d0c6cc8a44013dec83fa494 Mon Sep 17 00:00:00 2001 From: wj0903 Date: Sat, 3 Sep 2022 17:04:49 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20'gpu/train=5Ffail3.py'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- gpu/train_fail3.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 gpu/train_fail3.py diff --git a/gpu/train_fail3.py b/gpu/train_fail3.py new file mode 100644 index 0000000..5854d07 --- /dev/null +++ b/gpu/train_fail3.py @@ -0,0 +1,93 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code, please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +Due to the adaptability of a100, before using the training environment, please use the recommended image of the +platform with cuda 11. Then adjust the code and submit the image. +The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 +In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. +If it is a single dataset: +if MnistDataset_torch.zip is selected, then the dataset directory is /dataset/train, /dataset/test; +If it is a multiple dataset: +If MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected, +the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test +and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl + +The model download path is under /model by default. Please specify the model output location to /model, +and the Qizhi platform will provide file downloads under the /model directory. 
+'''
+
+
+import argparse
+import datetime
+
+import numpy as np
+import torch
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.datasets import mnist
+from torchvision.transforms import ToTensor
+
+from model import Model
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# The dataset location is placed under /dataset
+parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
+parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
+
+
+def gettime():
+    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
+    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+
+def train_one_epoch(model, loader, optimizer, criterion, device):
+    """Run one optimisation pass over ``loader``.
+
+    Each batch is moved to ``device``, the loss from ``criterion`` is
+    back-propagated and ``optimizer`` takes one step. The loss is printed
+    for every 10th batch (batch indices 0, 10, 20, ...).
+    """
+    model.train()
+    for idx, (train_x, train_label) in enumerate(loader):
+        train_x = train_x.to(device)
+        train_label = train_label.to(device)
+        optimizer.zero_grad()
+        predict_y = model(train_x.float())
+        loss = criterion(predict_y, train_label.long())
+        if idx % 10 == 0:
+            print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
+        loss.backward()
+        optimizer.step()
+
+
+def evaluate(model, loader, device):
+    """Return the fraction of samples in ``loader`` predicted correctly.
+
+    A prediction is the arg-max of the model output over its last axis.
+    Returns 0.0 when ``loader`` yields no samples, instead of dividing by
+    zero.
+    """
+    correct = 0
+    total = 0
+    model.eval()
+    # No gradients are needed for scoring; no_grad avoids building the
+    # autograd graph for every test batch.
+    with torch.no_grad():
+        for test_x, test_label in loader:
+            predict_y = model(test_x.to(device).float())
+            # Labels stay on the CPU, so bring the predictions back to compare.
+            predict_ys = predict_y.argmax(dim=-1).cpu()
+            correct += (predict_ys == test_label).sum().item()
+            total += test_label.shape[0]
+    return correct / total if total else 0.0
+
+
+def main():
+    """Train on the MNIST train split, score on the test split, save each epoch.
+
+    Dataset paths, epoch count and batch size come from ``parser``;
+    unrecognised command-line arguments are ignored (``parse_known_args``).
+    After every epoch the whole model object is written to
+    ``/model/mnist_epoch<N>_<accuracy>.pkl``.
+    """
+    args, _unknown = parser.parse_known_args()
+    # log output
+    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True,
+                                transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False,
+                               transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    print(gettime(), 'epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1))
+        train_one_epoch(model, train_loader, sgd, cost, device)
+        accuracy = evaluate(model, test_loader, device)
+        print(gettime(), 'accuracy: {:.2f}'.format(accuracy))
+        # The model output location is placed under /model
+        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy))
+    print("----------this is the end--------")
+    # NOTE(review): `a` is never defined, so this line raises NameError and the
+    # script exits with a failure. Presumably deliberate given the file name
+    # (train_fail3.py) - confirm before removing.
+    print(a)
+
+
+if __name__ == '__main__':
+    main()