class Model(Module):
    """Small LeNet-style convolutional classifier.

    Two conv -> ReLU -> max-pool stages are followed by three fully connected
    layers, each of which is followed by a ReLU. ``fc1`` takes 256 flattened
    features, which is what the conv/pool stack yields for a single-channel
    28x28 input (16 channels * 4 * 4). The network emits 10 values per sample.

    NOTE(review): ``relu5`` clamps the final layer's output to be
    non-negative. That is unusual if these values are consumed as logits by a
    cross-entropy loss -- confirm it is intended.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1 -> 6 -> 16 channels, 5x5 kernels, 2x2 pooling.
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(2)
        # Classifier head: 256 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(256, 120)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(120, 84)
        self.relu4 = nn.ReLU()
        self.fc3 = nn.Linear(84, 10)
        self.relu5 = nn.ReLU()

    def forward(self, x):
        """Run a batch through the network and return the per-class outputs.

        Args:
            x: input batch; the leading dimension is kept as the batch size
                when the feature maps are flattened.

        Returns:
            A tensor with one row of 10 non-negative values per input sample.
        """
        conv_stages = (
            self.conv1, self.relu1, self.pool1,
            self.conv2, self.relu2, self.pool2,
        )
        head_stages = (
            self.fc1, self.relu3,
            self.fc2, self.relu4,
            self.fc3, self.relu5,
        )

        out = x
        for stage in conv_stages:
            out = stage(out)

        # Collapse (channels, height, width) into one feature axis per sample.
        out = out.view(out.shape[0], -1)

        for stage in head_stages:
            out = stage(out)
        return out
def is_torch_dtu_available():
    """Report whether the ``torch_dtu`` backend can be imported.

    Returns:
        True only if ``torch_dtu``, ``torch_dtu.core`` and
        ``torch_dtu.core.dtu_model`` can all be located; False otherwise.
    """
    # Probe from the outermost package inwards and stop at the first miss:
    # looking up a dotted name imports its parent package, so the inner names
    # must not be queried when the outer package is absent.
    module_chain = (
        "torch_dtu",
        "torch_dtu.core",
        "torch_dtu.core.dtu_model",
    )
    return all(
        importlib.util.find_spec(name) is not None for name in module_chain
    )
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# Dataset locations default to the /tmp/dataset directories.
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--save_url', default="/tmp/output", help='path to save the model output')

parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')


def _select_device():
    """Choose the compute device, preferring the DTU backend when importable.

    Returns:
        A ``(device, dm)`` pair. ``dm`` is the ``torch_dtu.core.dtu_model``
        module when the DTU backend is available, otherwise ``None``; callers
        use it to decide how the optimizer step is performed.
    """
    if is_torch_dtu_available():
        # These imports are kept exactly as in the original script. Some are
        # not referenced here; presumably importing them has side effects in
        # the vendor runtime -- TODO confirm before pruning.
        import torch_dtu  # noqa: F401
        import torch_dtu.distributed as dist  # noqa: F401
        import torch_dtu.core.dtu_model as dm
        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP  # noqa: F401
        print('dtu is available: True')
        return dm.dtu_device(), dm

    print('dtu is available: False')
    return torch.device("cuda:0" if torch.cuda.is_available() else "cpu"), None


def _train_one_epoch(model, loader, cost, sgd, device, dm):
    """Run one optimisation pass over ``loader``, logging every 10th batch.

    Args:
        model: network being trained.
        loader: iterable of ``(inputs, labels)`` batches.
        cost: loss callable taking ``(predictions, labels)``.
        sgd: optimizer bound to ``model``'s parameters.
        device: device the batches are moved to.
        dm: DTU model module, or ``None`` when not running on DTU.
    """
    model.train()
    for idx, (train_x, train_label) in enumerate(loader):
        train_x = train_x.to(device)
        train_label = train_label.to(device)
        sgd.zero_grad()
        predict_y = model(train_x.float())
        loss = cost(predict_y, train_label.long())
        if idx % 10 == 0:
            print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
        loss.backward()
        if dm is not None:
            # The DTU backend performs the step through its own helper.
            dm.optimizer_step(sgd, barrier=True)
        else:
            sgd.step()


def _evaluate(model, loader, device):
    """Return the fraction of samples in ``loader`` classified correctly.

    Args:
        model: network to evaluate; it is switched to eval mode.
        loader: iterable of ``(inputs, labels)`` batches.
        device: device the inputs are moved to for the forward pass.

    Returns:
        Accuracy in ``[0, 1]``, or ``0.0`` when ``loader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation; skip building the graph.
    with torch.no_grad():
        for test_x, test_label in loader:
            predict_y = model(test_x.to(device).float())
            predict_ys = predict_y.argmax(dim=-1).cpu()
            correct += (predict_ys == test_label.cpu()).sum().item()
            total += test_label.shape[0]
    return correct / total if total else 0.0


def main():
    """Train the MNIST model and save one checkpoint per epoch.

    After every epoch the model is evaluated on the test set and a checkpoint
    named ``mnist_epoch<N>_<accuracy>.pkl`` is written to ``--save_url``.
    """
    device, dm = _select_device()

    args, _unknown = parser.parse_known_args()
    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Build the model and its optimizer exactly once, so the optimizer state
    # written to the checkpoint belongs to the model that was trained.
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epoch = args.epoch_size
    print('epoch_size is:{}'.format(epoch))

    os.makedirs(args.save_url, exist_ok=True)

    for _epoch in range(epoch):
        print('the {} epoch_size begin'.format(_epoch + 1))
        _train_one_epoch(model, train_loader, cost, sgd, device, dm)

        accuracy = _evaluate(model, test_loader, device)
        print('accuracy: {:.2f}'.format(accuracy))

        # Checkpoints go to --save_url (default /tmp/output).
        # NOTE(review): 'epoch' stores the configured epoch count, as in the
        # original script, not the index of the epoch just finished -- confirm
        # which one checkpoint consumers expect.
        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': epoch}
        checkpoint_name = 'mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy)
        torch.save(state, os.path.join(args.save_url, checkpoint_name))
        print('test:')
        print(os.listdir(args.save_url))


if __name__ == '__main__':
    main()