#!/usr/bin/python
# coding=utf-8
'''
If there are Chinese comments in the code, please add the following two lines
at the beginning of the file:

#!/usr/bin/python
# coding=utf-8

In the training environment, the code is automatically placed in the /tmp/code
directory and the uploaded dataset is automatically placed in the /tmp/dataset
directory.

Note: the paths differ between selecting a single dataset and selecting
multiple datasets.

(1) Single dataset: if MnistDataset_torch.zip is selected, the dataset
directories are /tmp/dataset/train and /tmp/dataset/test.

The dataset structure of the single dataset in the training image in this
example:

tmp
└── dataset
    ├── test
    └── train

(2) Multiple datasets: if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip
are selected, the dataset directories are /tmp/dataset/MnistDataset_torch/train,
/tmp/dataset/MnistDataset_torch/test and
/tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.

The dataset structure in the training image for multiple datasets in this
example:

tmp
└── dataset
    ├── MnistDataset_torch
    |   ├── test
    |   └── train
    └── checkpoint_epoch1_0.73
        └── mnist_epoch1_0.73.pkl

The model download path is /tmp/output by default; please write the model
output to /tmp/output, since the qizhi platform provides file downloads from
the /tmp/output directory.

In addition, if you want to get the model file after each training run, you
can call the uploader_for_gpu tool, which is invoked as:

import os
os.system("cd /tmp/script_for_grampus/ && ./uploader_for_gpu " + "/tmp/output/")
'''

import torch
from model import Model
import numpy as np
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os
import importlib.util


def is_torch_dtu_available():
    # The DTU backend is usable only if torch_dtu and its core model module
    # can all be imported.
    if importlib.util.find_spec("torch_dtu") is None:
        return False
    if importlib.util.find_spec("torch_dtu.core") is None:
        return False
    return importlib.util.find_spec("torch_dtu.core.dtu_model") is not None
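

# The header above describes two dataset layouts: a single dataset unpacks
# directly under /tmp/dataset, while multiple datasets unpack into one
# sub-directory per zip. The helper below is a minimal sketch of how a script
# could support both layouts; the name resolve_dataset_dirs and its default
# arguments are illustrative assumptions, not part of the platform API.
def resolve_dataset_dirs(base="/tmp/dataset", dataset_name="MnistDataset_torch"):
    multi_root = os.path.join(base, dataset_name)
    if os.path.isdir(multi_root):
        # Multi-dataset layout: /tmp/dataset/<dataset_name>/{train,test}
        return os.path.join(multi_root, "train"), os.path.join(multi_root, "test")
    # Single-dataset layout: /tmp/dataset/{train,test}
    return os.path.join(base, "train"), os.path.join(base, "test")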


# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--save_url', default="/tmp/output", help='path to save the model output')
parser.add_argument('--epoch_size', type=int, default=1, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each training step')
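
# A typical invocation in the training image might look like the line below;
# the script filename is an assumption, and unknown platform-injected flags
# are tolerated via parse_known_args in the main block.
#   python train.py --traindata /tmp/dataset/train --testdata /tmp/dataset/test --epoch_size 2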


if __name__ == '__main__':
    # Detect the DTU backend (set up by the platform's envs-xx.sh) and fall
    # back to CUDA or CPU when it is not available.
    if is_torch_dtu_available():
        import torch_dtu
        import torch_dtu.distributed as dist
        import torch_dtu.core.dtu_model as dm
        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
        print('dtu is available: True')
        device = dm.dtu_device()
        DTU_FLAG = True
    else:
        print('dtu is available: False')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        DTU_FLAG = False

    # Parameter declarations
    args, unknown = parser.parse_known_args()
    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epoch = args.epoch_size
    print('epoch_size is:{}'.format(epoch))

    # Make sure the output directory exists before saving checkpoints.
    os.makedirs(args.save_url, exist_ok=True)
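
    # Optional sketch (commented out, not executed): the header mentions a
    # checkpoint dataset such as checkpoint_epoch1_0.73.zip; under the
    # multi-dataset layout its .pkl could be restored before training. The
    # exact path is an assumption based on that example.
    # ckpt_path = '/tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl'
    # if os.path.exists(ckpt_path):
    #     state = torch.load(ckpt_path, map_location='cpu')
    #     model.load_state_dict(state['model'])
    #     sgd.load_state_dict(state['optimizer'])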

    for _epoch in range(epoch):
        print('epoch {} begins'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            if DTU_FLAG:
                # On DTU the optimizer step must go through the DTU runtime.
                dm.optimizer_step(sgd, barrier=True)
            else:
                sgd.step()

        # Evaluate on the test set after each epoch.
        correct = 0
        _sum = 0
        model.eval()
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            matches = predict_ys == test_label.numpy()
            correct += np.sum(matches, axis=-1)
            _sum += matches.shape[0]
        print('accuracy: {:.2f}'.format(correct / _sum))

        # The model output location is placed under /tmp/output
        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': _epoch + 1}
        torch.save(state, os.path.join(args.save_url, 'mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, correct / _sum)))
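
        # As described in the header, each epoch's checkpoint can be pushed to
        # the platform with the uploader_for_gpu tool; left commented out here
        # as an optional step.
        # os.system("cd /tmp/script_for_grampus/ && ./uploader_for_gpu " + "/tmp/output/")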

    # List the files written to the output directory.
    print('test:')
    print(os.listdir(args.save_url))