From 33c13668f92849727adfde3069e3f9a0e526e71d Mon Sep 17 00:00:00 2001
From: wjtest1215 <fortestonly321@yeah.net>
Date: Thu, 23 Mar 2023 19:38:09 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?=
 =?UTF-8?q?=20'gcu'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 gcu/model.py           |  35 ++++++++++
 gcu/train_for_c2net.py | 147 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 gcu/model.py
 create mode 100644 gcu/train_for_c2net.py

diff --git a/gcu/model.py b/gcu/model.py
new file mode 100644
index 0000000..157bad6
--- /dev/null
+++ b/gcu/model.py
@@ -0,0 +1,35 @@
+from torch.nn import Module
+from torch import nn
+
+
+class Model(Module):
+    """Small CNN: two conv/ReLU/max-pool stages, then three linear layers each followed by ReLU."""
+
+    def __init__(self):
+        super().__init__()
+        # Convolutional stages (single input channel).
+        self.conv1 = nn.Conv2d(1, 6, 5)
+        self.relu1 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.relu2 = nn.ReLU()
+        self.pool2 = nn.MaxPool2d(2)
+        # Linear stages; fc1 expects 256 flattened features and fc3 emits 10 values.
+        self.fc1 = nn.Linear(256, 120)
+        self.relu3 = nn.ReLU()
+        self.fc2 = nn.Linear(120, 84)
+        self.relu4 = nn.ReLU()
+        self.fc3 = nn.Linear(84, 10)
+        self.relu5 = nn.ReLU()
+
+    def forward(self, x):
+        """Pass ``x`` through every layer in definition order and return the result."""
+        features = self.pool1(self.relu1(self.conv1(x)))
+        features = self.pool2(self.relu2(self.conv2(features)))
+        # Collapse everything except the leading dimension before the linear layers.
+        flat = features.view(features.shape[0], -1)
+        hidden = self.relu3(self.fc1(flat))
+        hidden = self.relu4(self.fc2(hidden))
+        # NOTE(review): relu5 is applied to the fc3 output, so the returned values
+        # are never negative -- confirm this is intended for the loss used by callers.
+        return self.relu5(self.fc3(hidden))
\ No newline at end of file
diff --git a/gcu/train_for_c2net.py b/gcu/train_for_c2net.py
new file mode 100644
index 0000000..b9d5ac1
--- /dev/null
+++ b/gcu/train_for_c2net.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8   
+
+In the training environment, 
+the code will be automatically placed in the /tmp/code directory, 
+the uploaded dataset will be automatically placed in the /tmp/dataset directory
+
+Note: the paths are different when selecting a single dataset and multiple datasets.
+(1) Single dataset: if MnistDataset_torch.zip is selected,
+    the dataset directories are /tmp/dataset/train and /tmp/dataset/test.
+
+The dataset structure of the single dataset in the training image in this example:
+  tmp
+    ├──dataset 
+         ├── test
+         └── train 
+
+(2) Multiple datasets: if, for example, MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
+the dataset directories are /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
+and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.
+The dataset structure in the training image for multiple datasets in this example:
+tmp
+  ├──dataset
+     ├── MnistDataset_torch
+     |     ├── test
+     |     └── train 
+     └── checkpoint_epoch1_0.73 
+           ├── mnist_epoch1_0.73.pkl
+
+
+The model download path is /tmp/output by default, so write the model output to /tmp/output;
+the qizhi platform provides file downloads from the /tmp/output directory.
+
+In addition, if you want to retrieve the model file after each training run, you can call the
+uploader_for_gpu tool, which is invoked as follows:
+import os
+os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
+'''
+
+import torch
+from model import Model
+import numpy as np
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import os
+
+import importlib.util
+
+def is_torch_dtu_available():
+    """Return True only if torch_dtu, torch_dtu.core and torch_dtu.core.dtu_model all have a spec."""
+    # Names are probed outermost-first; all() stops at the first one without a
+    # spec, so a submodule is never looked up when its parent is missing.
+    names = ("torch_dtu", "torch_dtu.core", "torch_dtu.core.dtu_model")
+    return all(importlib.util.find_spec(name) is not None for name in names)
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# By default the datasets are read from /tmp/dataset and checkpoints are written to /tmp/output.
+parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
+parser.add_argument('--save_url', default="/tmp/output", help='directory to save model checkpoints to')
+
+parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
+parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
+
+
+if __name__ == '__main__':
+    # Use the DTU backend when torch_dtu is installed; otherwise fall back to CUDA or CPU.
+    DTU_FLAG = is_torch_dtu_available()
+    if DTU_FLAG:
+        import torch_dtu
+        import torch_dtu.distributed as dist
+        import torch_dtu.core.dtu_model as dm
+        from torch_dtu.nn.parallel import DistributedDataParallel as torchDDP
+        print('dtu is available: True')
+        device = dm.dtu_device()
+    else:
+        print('dtu is available: False')
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    # Unknown command-line arguments are ignored rather than rejected.
+    args, unknown = parser.parse_known_args()
+    batch_size = args.batch_size
+    epoch = args.epoch_size
+    print('epoch_size is:{}'.format(epoch))
+
+    # The MNIST files must already be on disk (download=False).
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+
+    # Build the model once, with the single optimizer that both trains it and
+    # is saved in the checkpoint.
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+
+    # Checkpoints are written to --save_url (default /tmp/output).
+    os.makedirs(args.save_url, exist_ok=True)
+
+    for _epoch in range(epoch):
+        print('the {} epoch_size begin'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            if DTU_FLAG:
+                # torch_dtu supplies its own optimizer-step entry point.
+                dm.optimizer_step(sgd, barrier=True)
+            else:
+                sgd.step()
+
+        # Evaluate on the test set; no gradients are needed here.
+        correct = 0
+        _sum = 0
+        model.eval()
+        with torch.no_grad():
+            for test_x, test_label in test_loader:
+                # Predictions are moved to the CPU, where the labels already are.
+                predict_y = model(test_x.to(device).float()).cpu()
+                predict_ys = predict_y.argmax(dim=-1)
+                correct += (predict_ys == test_label).sum().item()
+                _sum += test_label.shape[0]
+        # An empty test set reports 0.0 instead of dividing by zero.
+        accuracy = correct / _sum if _sum else 0.0
+        print('accuracy: {:.2f}'.format(accuracy))
+        # Save the weights with the optimizer that trained them and the epoch just completed.
+        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': _epoch + 1}
+        ckpt_name = 'mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, accuracy)
+        torch.save(state, os.path.join(args.save_url, ckpt_name))
+        print('test:')
+        print(os.listdir(args.save_url))
\ No newline at end of file