Reviewed-on: http://192.168.207.34:3001/org_wj0815/npu_test/pulls/11 (test_95%)
@@ -0,0 +1 @@
hello world
@@ -0,0 +1,23 @@
# Wild-type sequence provided in the "Dataset Description":
wtseq <- 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'
# Read test-set sequences and pH:
test <- read.csv('../input/novozymes-enzyme-stability-prediction/test.csv')
# Add mutation information to the test set.
# Assumes every test sequence is the wild type, a single substitution, or a
# single deletion (no insertions occur in this test set):
test[,c('type','resid','wt','mut')] <- do.call(rbind,lapply(test$protein_sequence,function(seq){
  # case 1 = wild type:
  if(seq==wtseq){
    return(c('WT',-1,'_','_'))
  # case 2 = substitution: first position where the two sequences differ
  } else if(nchar(seq)==nchar(wtseq)){
    i <- mapply(function(x,y) which(x!=y)[1], strsplit(seq,""), strsplit(wtseq,""))
    return(c('SUB',i,substr(wtseq,i,i),substr(seq,i,i)))
  # case 3 = deletion: compare against the wild type truncated to the same length
  } else if(nchar(seq)<nchar(wtseq)){
    wtsub <- substr(wtseq,1,nchar(seq))
    i <- mapply(function(x,y) which(x!=y)[1], strsplit(seq,""), strsplit(wtsub,""))
    return(c('DEL',i,substr(wtseq,i,i),'_'))
  }
}))
head(test)
@@ -0,0 +1,251 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "90e7b1d4",
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'torch'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-e39e8fd52943>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'torch'"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "from __future__ import print_function, division\n",
    "\n",
    "# import sys\n",
    "# sys.path.append('/home/xujiahong/openI_benchmark/vechicle_reID_VechicleNet/')\n",
    "\n",
    "import time\n",
    "import yaml\n",
    "import pickle\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "from torchvision import datasets,transforms\n",
    "import os\n",
    "import scipy.io\n",
    "from tqdm import tqdm\n",
    "from data_utils.model_train import ft_net\n",
    "from utils.util import get_stream_logger\n",
    "from config.mainconfig import OUTPUT_RESULT_DIR, CONFIG_PATH\n",
    "\n",
    "\n",
    "def fliplr(img):\n",
    "    '''flip horizontal'''\n",
    "    inv_idx = torch.arange(img.size(3)-1,-1,-1).long() # N x C x H x W\n",
    "    img_flip = img.index_select(3,inv_idx)\n",
    "    return img_flip\n",
    "\n",
    "def extract_feature(model, dataloaders, flip):\n",
    "    features = torch.FloatTensor()\n",
    "    count = 0\n",
    "    for _, data in enumerate(tqdm(dataloaders),0):\n",
    "        img, _ = data\n",
    "        n, c, h, w = img.size()\n",
    "        count += n\n",
    "\n",
    "        input_img = img.cuda()\n",
    "        ff = model(input_img)\n",
    "\n",
    "        if flip:\n",
    "            img = fliplr(img)\n",
    "            input_img = img.cuda()\n",
    "            outputs_flip = model(input_img)\n",
    "            ff += outputs_flip\n",
    "\n",
    "        fnorm = torch.norm(ff, p=2, dim=1, keepdim=True)\n",
    "        ff = ff.div(fnorm.expand_as(ff))\n",
    "        #print(ff.shape)\n",
    "        features = torch.cat((features,ff.data.cpu().float()), 0)\n",
    "        #features = torch.cat((features,ff.data.float()), 0)\n",
    "    return features\n",
    "\n",
    "\n",
    "def get_id(img_path):\n",
    "    '''\n",
    "    xjh:\n",
    "    example of the name of the img: 0769_c013_00074310_0\n",
    "    0769 is the vehicleID, 013 is the cameraID, 00074310 is the frameID\n",
    "    '''\n",
    "    camera_id = []\n",
    "    labels = []\n",
    "    for path, _ in img_path:\n",
    "        #filename = path.split('/')[-1]\n",
    "        filename = os.path.basename(path) #get the name of images\n",
    "        # Test Gallery Image\n",
    "        if not 'c' in filename:\n",
    "            labels.append(9999999)\n",
    "            camera_id.append(9999999)\n",
    "        else:\n",
    "            #label = filename[0:4]\n",
    "            label = filename[0:5] #for benchmark_person\n",
    "            camera = filename.split('c')[1]\n",
    "            if label[0:2]=='-1':\n",
    "                labels.append(-1)\n",
    "            else:\n",
    "                labels.append(int(label))\n",
    "            #camera_id.append(int(camera[0:3]))\n",
    "            camera_id.append(int(camera[0:2])) #for benchmark_person\n",
    "            #print(camera[0:3])\n",
    "    return camera_id, labels\n",
    "\n",
    "\n",
    "def test(config_file_path:str, logger):\n",
    "    #read config files\n",
    "    with open(config_file_path, encoding='utf-8') as f:\n",
    "        opts = yaml.load(f, Loader=yaml.SafeLoader)\n",
    "\n",
    "    data_dir = opts['input']['dataset']['data_dir']\n",
    "    name = \"trained_\" + opts['input']['config']['name']\n",
    "    trained_model_name = name + \"_last.pth\"\n",
    "    save_path = OUTPUT_RESULT_DIR\n",
    "\n",
    "    nclass = opts['input']['config']['nclass']\n",
    "    stride = opts['input']['config']['stride']\n",
    "    pool = opts['input']['config']['pool']\n",
    "    droprate = opts['input']['config']['droprate']\n",
    "    inputsize = opts['input']['config']['inputsize']\n",
    "    w = opts['input']['config']['w']\n",
    "    h = opts['input']['config']['h']\n",
    "    batchsize = opts['input']['config']['batchsize']\n",
    "    flip = opts['test']['flip_test']\n",
    "\n",
    "    trained_model_path = os.path.join(save_path, trained_model_name)\n",
    "\n",
    "    ##############################load model#################################################\n",
    "    ###self-train\n",
    "    model = ft_net(class_num = nclass, droprate = droprate, stride=stride, init_model=None, pool = pool, return_f=False)\n",
    "\n",
    "    try:\n",
    "        model.load_state_dict(torch.load(trained_model_path))\n",
    "    except:\n",
    "        model = torch.nn.DataParallel(model)\n",
    "        model.load_state_dict(torch.load(trained_model_path))\n",
    "        model = model.module\n",
    "    model.classifier.classifier = nn.Sequential() #model ends with feature extractor(output len is 512)\n",
    "    # print(model)\n",
    "\n",
    "    ##############################load dataset###############################################\n",
    "\n",
    "    #transforms for input image h==w==299, inputsize==256\n",
    "    if h == w:\n",
    "        data_transforms = transforms.Compose([\n",
    "            transforms.Resize( ( round(inputsize*1.1), round(inputsize*1.1)), interpolation=3),\n",
    "            transforms.ToTensor(),\n",
    "            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n",
    "        ])\n",
    "    else:\n",
    "        data_transforms = transforms.Compose( [\n",
    "            transforms.Resize((round(h*1.1), round(w*1.1)), interpolation=3), #Image.BICUBIC\n",
    "            transforms.ToTensor(),\n",
    "            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n",
    "        ])\n",
    "\n",
    "    image_datasets = {x: datasets.ImageFolder( os.path.join(data_dir,x) ,data_transforms) for x in ['bounding_box_test','query']}\n",
    "    dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batchsize,\n",
    "                                                  shuffle=False, num_workers=8) for x in ['bounding_box_test','query']}\n",
    "\n",
    "    #############################check GPU###################################################\n",
    "    use_gpu = torch.cuda.is_available()\n",
    "\n",
    "\n",
    "    #############################extract features############################################\n",
    "    # Change to test mode\n",
    "    model = model.eval()\n",
    "    if use_gpu:\n",
    "        model = model.cuda()\n",
    "\n",
    "    gallery_path = image_datasets['bounding_box_test'].imgs\n",
    "    query_path = image_datasets['query'].imgs\n",
    "\n",
    "    gallery_cam,gallery_label = get_id(gallery_path)\n",
    "    query_cam,query_label = get_id(query_path)\n",
    "\n",
    "\n",
    "    gallery_label = np.asarray(gallery_label)\n",
    "    query_label = np.asarray(query_label)\n",
    "    gallery_cam = np.asarray(gallery_cam)\n",
    "    query_cam = np.asarray(query_cam)\n",
    "    print('Gallery Size: %d'%len(gallery_label))\n",
    "    print('Query Size: %d'%len(query_label))\n",
    "    # Extract feature\n",
    "    since = time.time()\n",
    "    with torch.no_grad():\n",
    "        gallery_feature = extract_feature(model, dataloaders['bounding_box_test'], flip)\n",
    "        query_feature = extract_feature(model, dataloaders['query'], flip)\n",
    "    process_time = time.time() - since\n",
    "    logger.info('total forward time: %.2f minutes'%(process_time/60))\n",
    "\n",
    "    dist = 1-torch.mm(query_feature, torch.transpose(gallery_feature, 0, 1))\n",
    "\n",
    "    # Save to Matlab for check\n",
    "    extracted_feature = {'gallery_feature': gallery_feature.numpy(), 'gallery_label':gallery_label, 'gallery_cam':gallery_cam, \\\n",
    "                         'query_feature': query_feature.numpy(), 'query_label':query_label, 'query_cam':query_cam}\n",
    "\n",
    "    result_name = os.path.join(save_path, name+'_feature.mat')\n",
    "    scipy.io.savemat(result_name, extracted_feature)\n",
    "\n",
    "    return_dict = {}\n",
    "\n",
    "    return_dict['dist'] = dist.numpy()\n",
    "    return_dict['feature_example'] = query_feature[0].numpy()\n",
    "    return_dict['gallery_label'] = gallery_label\n",
    "    return_dict['gallery_cam'] = gallery_cam\n",
    "    return_dict['query_label'] = query_label\n",
    "    return_dict['query_cam'] = query_cam\n",
    "\n",
    "    pickle.dump(return_dict, open(OUTPUT_RESULT_DIR+'test_result.pkl', 'wb'), protocol=4)\n",
    "\n",
    "    return\n",
    "\n",
    "    # eval_result = evaluator(result, logger)\n",
    "    # full_table = display_eval_result(dict = eval_result)\n",
    "    # logger.info(full_table)\n",
    "\n",
    "if __name__==\"__main__\":\n",
    "    logger = get_stream_logger('TEST')\n",
    "    test(CONFIG_PATH, logger)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c27b171e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MindSpore",
   "language": "python",
   "name": "mindspore"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,114 @@
"""
######################## train lenet example ########################
Train LeNet and produce network model files (.ckpt).
"""
#!/usr/bin/python
#coding=utf-8
import os
import argparse
import moxing as mox
from config import mnist_cfg as cfg
from dataset import create_dataset
from dataset_distributed import create_dataset_parallel
from lenet import LeNet5
import json
import mindspore.nn as nn
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train import Model
from mindspore.nn.metrics import Accuracy
from mindspore import load_checkpoint, load_param_into_net
from mindspore.context import ParallelMode
from mindspore.communication.management import init, get_rank
import time

### Copy multiple datasets from obs to the training image ###
def MultiObsToEnv(multi_data_url, data_dir):
    # --multi_data_url is JSON data, so parse it first
    multi_data_json = json.loads(multi_data_url)
    for i in range(len(multi_data_json)):
        path = data_dir + "/" + multi_data_json[i]["dataset_name"]
        file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0]
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        try:
            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], path))
            # unzip dataset
            os.system("unzip -d %s %s" % (file_path, path))
        except Exception as e:
            print('moxing download {} to {} failed: '.format(
                multi_data_json[i]["dataset_url"], path) + str(e))
    # Write a cache file to signal that the data has been copied from obs.
    # If this file exists during multi-card training, the dataset does not
    # need to be copied again by the other cards.
    f = open("/cache/download_input.txt", 'w')
    f.close()
    try:
        if os.path.exists("/cache/download_input.txt"):
            print("download_input succeed")
    except Exception as e:
        print("download_input failed")
    return

def DownloadFromQizhi(multi_data_url, data_dir):
    device_num = int(os.getenv('RANK_SIZE'))
    if device_num == 1:
        MultiObsToEnv(multi_data_url, data_dir)
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    if device_num > 1:
        # set device_id and init for multi-card training
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
        init()
        # Copying obs data does not need to be executed multiple times; only the 0th card copies the data
        local_rank = int(os.getenv('RANK_ID'))
        if local_rank % 8 == 0:
            MultiObsToEnv(multi_data_url, data_dir)
        # If the cache file does not exist, copying has not finished yet,
        # so wait for the 0th card to finish copying the data
        while not os.path.exists("/cache/download_input.txt"):
            time.sleep(1)
    return

parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
### --multi_data_url, --ckpt_url and --device_target must be defined first in a
### multi-dataset task, otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the
### Qizhi platform; they are predefined in the background, so you only need to
### define them in your code.
parser.add_argument('--multi_data_url',
                    help='dataset path in obs')
parser.add_argument('--ckpt_url',
                    help='pre_train_model path in obs')
parser.add_argument(
    '--device_target',
    type=str,
    default="Ascend",
    choices=['Ascend', 'CPU'],
    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
parser.add_argument('--epoch_size',
                    type=int,
                    default=5,
                    help='Training epochs.')

if __name__ == "__main__":
    args, unknown = parser.parse_known_args()
    data_dir = '/cache/dataset'
    train_dir = '/cache/output'
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    ### Initialize and copy data to the training image
    DownloadFromQizhi(args.multi_data_url, data_dir)
    print("--------start ls:")
    os.system("cd /cache/dataset; ls -al")
    print("--------end ls-----------")
@@ -0,0 +1 @@
print('hello world')
@@ -0,0 +1,128 @@
#!/usr/bin/python
#coding=utf-8
'''
If the code contains Chinese comments, add the following at the top:
#!/usr/bin/python
#coding=utf-8

1. The dataset structure for the single-dataset case in this example:
   MnistDataset_torch.zip
   ├── test
   └── train

2. For A100 compatibility, please use the platform's recommended image with
   CUDA 11 before using the training environment, then adjust the code and
   submit the image. The image used by this example is:
   dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191

In the training environment, the uploaded dataset is automatically placed in
the /dataset directory.
Note: the paths differ between selecting a single dataset and multiple datasets.
(1) Single dataset: if MnistDataset_torch.zip is selected,
    the dataset directories are /dataset/train and /dataset/test.
    Multiple datasets: if MnistDataset_torch.zip is selected,
    the dataset directories are /dataset/MnistDataset_torch/train and /dataset/MnistDataset_torch/test.
(2) If a pre-training model file is selected, it is automatically placed in
    the /pretrainmodel directory, for example:
    if the model file is selected, it is referenced as '/pretrainmodel/' + args.pretrainmodelname
The model download path is /model by default. Please write model output to /model;
the Qizhi platform provides file downloads from the /model directory.
'''
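# The helper below is a minimal sketch of the path convention described above.
# It is not part of this script's logic or of any platform API: it assumes the
# /dataset layout from the docstring and the dataset name used in this example,
# and simply probes which variant is present.
import os

def resolve_dataset_dirs(dataset_name="MnistDataset_torch", root="/dataset"):
    """Return (train_dir, test_dir) for the single- or multi-dataset case."""
    multi_root = os.path.join(root, dataset_name)
    if os.path.isdir(multi_root):
        # multiple datasets selected: /dataset/<name>/train and /dataset/<name>/test
        return os.path.join(multi_root, "train"), os.path.join(multi_root, "test")
    # single dataset selected: /dataset/train and /dataset/test
    return os.path.join(root, "train"), os.path.join(root, "test")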
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size per step')
# Name of the model checkpoint file
parser.add_argument('--ckpt_url', default="", help='pretrain model path')

# Parameter declarations
WORKERS = 0  # number of DataLoader worker threads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()

# Model training
def train(model, train_loader, epoch):
    model.train()
    train_loss = 0
    for i, data in enumerate(train_loader, 0):
        x, y = data
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = cost(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # accumulate a float, not the graph-holding tensor
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))

# Model testing
def test(model, test_loader, test_data):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            x, y = data
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            test_loss += cost(y_hat, y).item()
            pred = y_hat.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()
    test_loss /= (i+1)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_data), 100. * correct / len(test_data)))

def main():
    # If a saved model exists, load it and continue training from it
    if os.path.exists(args.ckpt_url):
        checkpoint = torch.load(args.ckpt_url)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Successfully loaded weights from epoch {}!'.format(start_epoch))
    else:
        start_epoch = 0
        print('No saved model found, training from scratch!')
    for epoch in range(start_epoch+1, epochs):
        train(model, train_loader, epoch)
        test(model, test_loader, test_dataset)
        # Save the model
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        torch.save(state, '/model/mnist_epoch{}.pkl'.format(epoch))

if __name__ == '__main__':
    args, unknown = parser.parse_known_args()
    # log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    epochs = args.epoch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    main()
@@ -0,0 +1,144 @@
#!/usr/bin/python
#coding=utf-8
'''
If the code contains Chinese comments, add the following at the top:
#!/usr/bin/python
#coding=utf-8

In the training environment,
(1) the code is automatically placed in the /tmp/code directory,
(2) the uploaded dataset is automatically placed in the /tmp/dataset directory.
Note: the paths differ between selecting a single dataset and multiple datasets.
    Single dataset: if MnistDataset_torch.zip is selected,
    the dataset directories are /tmp/dataset/train and /tmp/dataset/test.
    The dataset structure of the single dataset in the training image in this example:
    tmp
    ├── dataset
        ├── test
        └── train
    If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
    the dataset directories are /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
    and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.
    The dataset structure in the training image for multiple datasets in this example:
    tmp
    ├── dataset
        ├── MnistDataset_torch
        |   ├── test
        |   └── train
        └── checkpoint_epoch1_0.73
            └── mnist_epoch1_0.73.pkl
(3) The model download path is /tmp/output by default; please write model output
    to /tmp/output, and the Qizhi platform will provide file downloads from the
    /tmp/output directory.
(4) If a pre-training model file is selected, it is automatically placed in the
    /tmp/pretrainmodel directory, for example:
    if the model file is selected, it is referenced as '/tmp/pretrainmodel/' + args.pretrainmodelname
In addition, if you want to retrieve the model files after each epoch, you can
call the uploader_for_gpu tool, written as:
import os
os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
'''
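# A minimal sketch of the per-epoch result upload described above, assuming the
# grampus uploader exists at the path documented in the docstring; wrapping the
# call in a helper is our own convention here, not a platform requirement.
import os

def upload_output(output_dir="/tmp/output/"):
    # push everything under output_dir back to Qizhi so it can be downloaded
    os.system("cd /tmp/script_for_grampus/ && ./uploader_for_gpu " + output_dir)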
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size per step')
# Name of the model checkpoint file
parser.add_argument('--ckpt_url', default="", help='pretrain model path')

# Parameter declarations
WORKERS = 0  # number of DataLoader worker threads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()

# Model training
def train(model, train_loader, epoch):
    model.train()
    train_loss = 0
    for i, data in enumerate(train_loader, 0):
        x, y = data
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = cost(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # accumulate a float, not the graph-holding tensor
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))

# Model testing
def test(model, test_loader, test_data):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            x, y = data
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            test_loss += cost(y_hat, y).item()
            pred = y_hat.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()
    test_loss /= (i+1)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_data), 100. * correct / len(test_data)))

def main():
    # If a saved model exists, load it and continue training from it
    if os.path.exists(args.ckpt_url):
        checkpoint = torch.load(args.ckpt_url)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Successfully loaded weights from epoch {}!'.format(start_epoch))
    else:
        start_epoch = 0
        print('No saved model found, training from scratch!')
    for epoch in range(start_epoch+1, epochs):
        train(model, train_loader, epoch)
        test(model, test_loader, test_dataset)
        # Save the model
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        torch.save(state, '/tmp/output/mnist_epoch{}.pkl'.format(epoch))
        # After calling uploader_for_gpu, the result files under /tmp/output are
        # sent back to Qizhi after each epoch of training
        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")

if __name__ == '__main__':
    args, unknown = parser.parse_known_args()
    # log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    epochs = args.epoch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    main()
@@ -30,6 +30,7 @@ from torch.optim import SGD
 from torch.utils.data import DataLoader
 from torchvision.transforms import ToTensor
 import argparse
+import datetime
 
 # Training settings
 parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
@@ -39,10 +40,14 @@ parser.add_argument('--testdata', default="/dataset/test" ,help='path to test da
 parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train')
 parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')
 
+def gettime():
+    timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    return timestr
+
 if __name__ == '__main__':
     args, unknown = parser.parse_known_args()
     #log output
-    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     batch_size = args.batch_size
     train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
@@ -53,9 +58,9 @@ if __name__ == '__main__':
     sgd = SGD(model.parameters(), lr=1e-1)
     cost = CrossEntropyLoss()
     epoch = args.epoch_size
-    print('epoch_size is:{}'.format(epoch))
+    print(gettime(), 'epoch_size is:{}'.format(epoch))
     for _epoch in range(epoch):
-        print('the {} epoch_size begin'.format(_epoch + 1))
+        print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1))
         model.train()
         for idx, (train_x, train_label) in enumerate(train_loader):
             train_x = train_x.to(device)
@@ -64,8 +69,10 @@ if __name__ == '__main__':
             sgd.zero_grad()
             predict_y = model(train_x.float())
             loss = cost(predict_y, train_label.long())
-            if idx % 10 == 0:
-                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            #if idx % 10 == 0:
+                #print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            print(gettime())
+            print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
             loss.backward()
             sgd.step()
 
@@ -81,6 +88,6 @@ if __name__ == '__main__':
             _ = predict_ys == test_label
             correct += np.sum(_.numpy(), axis=-1)
             _sum += _.shape[0]
-        print('accuracy: {:.2f}'.format(correct / _sum))
+        print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum))
         #The model output location is placed under /model
         torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
@@ -0,0 +1,121 @@
#####################################################################################################
# Continue-training feature: when editing a training task, if "reuse last results" is checked,
# the previous task's results can be read from the new task's output path.
#
# Example usage
# - Add two training parameters:
#   'ckpt_save_name'  the output file name of this task
#   'ckpt_load_name'  the output file name of the previous task, used to load the model file
#                     produced last time; the default is empty, meaning no file is read
# - In the training code, check whether 'ckpt_load_name' is empty; if it is not,
#   this is a continue-training task
#####################################################################################################
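# A hedged usage sketch (the script name and task names here are made up; the
# argument names are the ones defined below): the first task only sets
# 'ckpt_save_name', and the follow-up task reuses that file via 'ckpt_load_name'.
#   first task:     python pretrain_continue.py --ckpt_save_name=run1
#   follow-up task: python pretrain_continue.py --ckpt_load_name=run1 --ckpt_save_name=run2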
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size per step')
# Name of the pretrained model file
parser.add_argument('--ckpt_url', default="", help='pretrain model path')
# Model file names for continue training
parser.add_argument('--ckpt_save_name', default="", help='save model name')
parser.add_argument('--ckpt_load_name', default="", help='load model name')

# Parameter declarations
WORKERS = 0  # number of DataLoader worker threads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()

# Model training
def train(model, train_loader, epoch):
    model.train()
    train_loss = 0
    for i, data in enumerate(train_loader, 0):
        x, y = data
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = cost(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # accumulate a float, not the graph-holding tensor
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))

# Model testing
def test(model, test_loader, test_data):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            x, y = data
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            test_loss += cost(y_hat, y).item()
            pred = y_hat.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()
    test_loss /= (i+1)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_data), 100. * correct / len(test_data)))

def main():
    base_path = "/model"  # change to "/tmp/output" when using the intelligent computing (C2Net) cluster
    # Load the pretrained model; this only applies to the first task, i.e. when args.ckpt_load_name is empty
    if os.path.exists(args.ckpt_url) and not args.ckpt_load_name:
        checkpoint = torch.load(args.ckpt_url)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Successfully loaded pretrained weights from epoch {}!'.format(start_epoch))
    # Load the continue-training checkpoint; requires a preceding task that produced an output file
    elif args.ckpt_load_name:
        load_path = "{}/{}.pkl".format(base_path, args.ckpt_load_name)
        checkpoint = torch.load(load_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Successfully loaded continue-training weights from epoch {}!'.format(start_epoch))
    else:
        print('No saved model found, training from scratch!')
    for epoch in range(epochs):
        train(model, train_loader, epoch)
        test(model, test_loader, test_dataset)
        # Save the model
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        save_path = "{}/{}.pkl".format(base_path, args.ckpt_save_name)
        torch.save(state, save_path)

if __name__ == '__main__':
    args, unknown = parser.parse_known_args()
    # log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    epochs = args.epoch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    main()
@@ -0,0 +1,122 @@
#####################################################################################################
# Continue-training feature: when editing a training task, if "reuse last results" is checked,
# the previous task's results can be read from the new task's output path.
#
# Example usage
# - Add two training parameters:
#   'ckpt_save_name'  the output file name of this task
#   'ckpt_load_name'  the output file name of the previous task, used to load the model file
#                     produced last time; the default is empty, meaning no file is read
# - In the training code, check whether 'ckpt_load_name' is empty; if it is not,
#   this is a continue-training task
#####################################################################################################
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size per step')
# Name of the pretrained model file
parser.add_argument('--ckpt_url', default="", help='pretrain model path')
# Model file names for continue training
parser.add_argument('--ckpt_save_name', default="", help='save model name')
parser.add_argument('--ckpt_load_name', default="", help='load model name')

# Parameter declarations
WORKERS = 0  # number of DataLoader worker threads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()

# Model training
def train(model, train_loader, epoch):
    model.train()
    train_loss = 0
    for i, data in enumerate(train_loader, 0):
        x, y = data
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = cost(y_hat, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()  # accumulate a float, not the graph-holding tensor
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))

# Model testing
def test(model, test_loader, test_data):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for i, data in enumerate(test_loader, 0):
            x, y = data
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            test_loss += cost(y_hat, y).item()
            pred = y_hat.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()
    test_loss /= (i+1)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_data), 100. * correct / len(test_data)))

def main():
    base_path = "/tmp/output"  # the output path on the intelligent computing (C2Net) cluster
    # Load the pretrained model; this only applies to the first task, i.e. when args.ckpt_load_name is empty
    if os.path.exists(args.ckpt_url) and not args.ckpt_load_name:
        checkpoint = torch.load(args.ckpt_url)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Successfully loaded pretrained weights from epoch {}!'.format(start_epoch))
    # Load the continue-training checkpoint; requires a preceding task that produced an output file
    elif args.ckpt_load_name:
        load_path = "{}/{}.pkl".format(base_path, args.ckpt_load_name)
        checkpoint = torch.load(load_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Successfully loaded continue-training weights from epoch {}!'.format(start_epoch))
    else:
        print('No saved model found, training from scratch!')
    for epoch in range(epochs):
        train(model, train_loader, epoch)
        test(model, test_loader, test_dataset)
        # Save the model
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        save_path = "{}/{}.pkl".format(base_path, args.ckpt_save_name)
        torch.save(state, save_path)
        # send the result files under /tmp/output back to Qizhi after each epoch
        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")

if __name__ == '__main__':
    args, unknown = parser.parse_known_args()
    # log output
    print('cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    epochs = args.epoch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    main()
@@ -0,0 +1,2 @@
import aaaa  # deliberately missing module: this file exercises failure reporting
print('test failure, no module')
@@ -0,0 +1,93 @@
#!/usr/bin/python
#coding=utf-8
'''
If the code contains Chinese comments, add the following at the top:
#!/usr/bin/python
#coding=utf-8

For A100 compatibility, please use the platform's recommended image with CUDA 11
before using the training environment, then adjust the code and submit the image.
The image used by this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded dataset is automatically placed in the /dataset directory.
Single dataset:
    if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test.
Multiple datasets:
    if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
    the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
    and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.
The model download path is /model by default. Please write model output to /model;
the Qizhi platform provides file downloads from the /model directory.
'''
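# A hedged sketch (not used by the rest of this script) of loading the checkpoint
# shipped as a second dataset: the path follows the docstring above, and loading a
# full nn.Module with torch.load matches how this example saves its .pkl files.
import os
import torch

_ckpt = '/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl'
if os.path.exists(_ckpt):
    pretrained_model = torch.load(_ckpt)  # the full model object, as saved by torch.save(model, ...)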
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import datetime

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size per step')

def gettime():
    timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return timestr

if __name__ == '__main__':
    args, unknown = parser.parse_known_args()
    # log output
    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epoch = args.epoch_size
    print(gettime(), 'epoch_size is:{}'.format(epoch))
    for _epoch in range(epoch):
        print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            if idx % 10 == 0:
                print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            sgd.step()
        correct = 0
        _sum = 0
        model.eval()
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            _ = predict_ys == test_label
            correct += np.sum(_.numpy(), axis=-1)
            _sum += _.shape[0]
        print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum))
        # The model output location is placed under /model
        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
    print("----------this is the end--------")
    # The next line is left deliberately broken (unclosed parenthesis); this file tests syntax-error reporting
    print("abc"
@@ -0,0 +1,94 @@
#!/usr/bin/python
#coding=utf-8
'''
If the code contains Chinese comments, add the following at the top:
#!/usr/bin/python
#coding=utf-8

For A100 compatibility, please use the platform's recommended image with CUDA 11
before using the training environment, then adjust the code and submit the image.
The image used by this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded dataset is automatically placed in the /dataset directory.
Single dataset:
    if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test.
Multiple datasets:
    if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
    the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
    and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.
The model download path is /model by default. Please write model output to /model;
the Qizhi platform provides file downloads from the /model directory.
'''
from model import Model
import numpy as np
import torch
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import datetime

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
# The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='how many epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size per step')

def gettime():
    timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return timestr

if __name__ == '__main__':
    args, unknown = parser.parse_known_args()
    # log output
    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = args.batch_size
    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model = Model().to(device)
    sgd = SGD(model.parameters(), lr=1e-1)
    cost = CrossEntropyLoss()
    epoch = args.epoch_size
    print(gettime(), 'epoch_size is:{}'.format(epoch))
    for _epoch in range(epoch):
        print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1))
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x = train_x.to(device)
            train_label = train_label.to(device)
            sgd.zero_grad()
            predict_y = model(train_x.float())
            loss = cost(predict_y, train_label.long())
            #if idx % 10 == 0:
                #print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
            print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
            loss.backward()
            sgd.step()
        correct = 0
        _sum = 0
        model.eval()
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            _ = predict_ys == test_label
            correct += np.sum(_.numpy(), axis=-1)
            _sum += _.shape[0]
        print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum))
        # The model output location is placed under /model
        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
    print("----------this is the end--------")
    # The next line is left deliberately broken (undefined name 'a'); this file tests runtime-error reporting
    print(a)
@@ -0,0 +1,93 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
''' | |||||
If there are Chinese comments in the code,please add at the beginning: | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
Due to the adaptability of a100, before using the training environment, please use the recommended image of the | |||||
platform with cuda 11.Then adjust the code and submit the image. | |||||
The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 | |||||
In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. | |||||
If it is a single dataset: | |||||
if MnistDataset_torch.zip is selected,Then the dataset directory is /dataset/train, /dataset/test; | |||||
If it is a multiple dataset: | |||||
If MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected, | |||||
the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test | |||||
and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl | |||||
The model download path is under /model by default. Please specify the model output location to /model, | |||||
and the Qizhi platform will provide file downloads under the /model directory. | |||||
''' | |||||
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
import datetime | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /dataset | |||||
parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') | |||||
parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') | |||||
parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') | |||||
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') | |||||
def gettime(): | |||||
timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') | |||||
return timestr | |||||
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
model = Model().to(device) | |||||
sgd = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
epoch = args.epoch_size | |||||
print(gettime(), 'epoch_size is:{}'.format(epoch)) | |||||
for _epoch in range(epoch): | |||||
print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1)) | |||||
model.train() | |||||
for idx, (train_x, train_label) in enumerate(train_loader): | |||||
train_x = train_x.to(device) | |||||
train_label = train_label.to(device) | |||||
label_np = np.zeros((train_label.shape[0], 10)) | |||||
sgd.zero_grad() | |||||
predict_y = model(train_x.float()) | |||||
loss = cost(predict_y, train_label.long()) | |||||
print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item())) | |||||
if idx % 10 == 0: | |||||
print("------------------") | |||||
loss.backward() | |||||
sgd.step() | |||||
correct = 0 | |||||
_sum = 0 | |||||
model.eval() | |||||
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            matches = predict_ys == test_label
            correct += np.sum(matches.numpy(), axis=-1)
            _sum += matches.shape[0]
print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum)) | |||||
#The model output location is placed under /model | |||||
torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) |
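Note that the loop above pickles the entire model object with torch.save(model, ...) rather than a state dict, so loading it back is a plain torch.load, and model.py must be importable at unpickling time. A minimal sketch, assuming a checkpoint produced by this script has been downloaded locally:
```
import torch

# The .pkl is a pickled Model instance, so torch.load restores it directly;
# the Model class from model.py must be importable for unpickling to succeed.
model = torch.load('mnist_epoch1_0.73.pkl', map_location='cpu')  # hypothetical local copy
model.eval()
```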
@@ -0,0 +1,112 @@ | |||||
# How to Train a Model on the Qizhi Platform - GPU Version
- Single-dataset training on the Qizhi cluster, multi-dataset training on the Qizhi cluster, and single-dataset training on the intelligent computing (C2Net) cluster are used in three different ways; please keep them apart:
  - For single-dataset training on the Qizhi cluster, see the code comments in [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train.py)
  - For single-dataset training that **loads a pre-trained model** on the Qizhi cluster, see the code comments in [pretrain.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/pretrain.py)
  - For multi-dataset training on the Qizhi cluster, see the code comments in [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_multidataset.py)
  - For single-dataset training on the intelligent computing cluster, see the code comments in [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_c2net.py)
  - For single-dataset training that **loads a pre-trained model** on the intelligent computing cluster, see the code comments in [pretrain_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/pretrain_for_c2net.py)
- On the Qizhi cluster, single and multiple datasets differ only in where they are mounted:
  with a single dataset, MNISTDataset_torch.zip is extracted under /dataset/;
  with multiple datasets, MNISTDataset_torch.zip is extracted under /dataset/MNISTDataset_torch/.
- On the intelligent computing network, if you need the training results returned after every epoch, you can use the upload tool to push the contents of the /tmp/output folder back to Qizhi for download, written as:
``` | |||||
os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/") | |||||
``` | |||||
## 1 Overview
- Using LeNet5-MNIST-PyTorch as an example, this project briefly introduces how to complete training tasks with PyTorch on the Qizhi AI collaboration platform, covering single-dataset training, multi-dataset training, and training on the intelligent computing network, and aims to give AI developers a working Qizhi training example.
- You can create your own training task directly from the dataset and code files provided by this project.
## 2 Preparation
- To use the Qizhi platform, you need to create a Qizhi account, clone the code into your own account, and upload the dataset. For detailed instructions, see the beginner training-camp course series in the [OpenI_Learning](https://git.openi.org.cn/zeizei/OpenI_Learning) project.
### 2.1 Data Preparation
#### Getting the dataset
- If you just want to try this example, you do not need to upload the dataset again: MnistDataset_torch.zip is already a public dataset and can be referenced directly. You can also download it from this project's dataset page to inspect its structure: [MNISTDataset_torch.zip download](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0), [mnist_epoch1_0.73.pkl.zip download](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0).
- Data file description
  - The MNIST dataset consists of 28×28 grayscale images in 10 classes; the training set contains 60,000 images and the test set contains 10,000 images.
  - The directory structure of the dataset archives is as follows:
> MNISTDataset_torch.zip
> ├── test
> │   └── MNIST
> │       ├── raw
> │       │   ├── t10k-images-idx3-ubyte
> │       │   ├── t10k-labels-idx1-ubyte
> │       │   ├── train-images-idx3-ubyte
> │       │   └── train-labels-idx1-ubyte
> │       └── processed
> │           ├── test.pt
> │           └── training.pt
> └── train
>     └── MNIST
>         ├── raw
>         │   ├── t10k-images-idx3-ubyte
>         │   ├── t10k-labels-idx1-ubyte
>         │   ├── train-images-idx3-ubyte
>         │   └── train-labels-idx1-ubyte
>         └── processed
>             ├── test.pt
>             └── training.pt
>
> mnist_epoch1_0.73.pkl.zip
> └── mnist_epoch1_0.73.pkl
>
#### Uploading the dataset
Training on GPU runs on GPU hardware, so the dataset must be uploaded through the GPU page. (This step is not needed for this example; you can directly select the public dataset MNISTDataset_torch.zip.)
### 2.2 Preparing the Training Scripts
#### Example code
- The example code can be downloaded from this repository: [code download](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU)
- Code file description
  - [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train.py): script for single-dataset training. For details, see the comments in [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train.py)
  - [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_multidataset.py): script for multi-dataset training. For details, see the comments in [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_multidataset.py)
  - [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_c2net.py): script for training on the intelligent computing network. For details, see the comments in [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_c2net.py)
  - [model.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/model.py): the network used for training, shared by single-dataset, multi-dataset, and intelligent-computing-network training.
## 3 Creating a Training Task
Once the data and scripts are ready, create a training task to run the PyTorch script. First-time users can refer to this example.
### Training UI example
Because of A100 compatibility, the A100 requires CUDA 11 or later. The platform already provides CUDA base images for the A100; simply select the corresponding public image:
 | |||||
Reference values for the training UI parameters:
 | |||||
Table 1. Parameters on the create-training-job page
| Parameter | Description |
| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Compute resource | Select CPU/GPU |
| Code branch | Select the branch of the repository code to use; master is the default |
| Image | Select an image already debugged in the debugging environment; for the current version, please choose a base image: the platform provides A100 CUDA base images such as dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 |
| Startup file | Select the startup script train.py in the code directory |
| Dataset | Select the public dataset MnistDataset_torch.zip already uploaded to the Qizhi platform |
| Run parameters | Extra run parameters pass values to the script's other arguments, e.g. epoch_size (see the sketch below) |
| Resource spec | Select a spec that includes GPUs |
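The run-parameters mechanism works because the example scripts parse their arguments with argparse's parse_known_args, so each run parameter becomes a command-line flag and unrecognized platform-injected flags are collected instead of crashing the run. A minimal sketch of that behavior:
```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--epoch_size', type=int, default=1)
# Unknown flags (e.g. ones injected by the platform) land in `unknown` instead of raising.
args, unknown = parser.parse_known_args(['--epoch_size', '2', '--some_platform_flag', 'x'])
print(args.epoch_size)  # 2
print(unknown)          # ['--some_platform_flag', 'x']
```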
## 4 Viewing the Results
### 4.1 The run log can be viewed on the training-job page
Currently, training-task logs are only what the code prints; see the print calls in the example train.py.
### 4.2 The model file can be downloaded after training ends
 | |||||
## If you have any questions about the example code, feel free to open an issue in this project.
@@ -0,0 +1,76 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
GPU INFERENCE INSTANCE
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
Because of A100 compatibility, please use the platform's recommended image with CUDA 11,
then adjust the code and submit the image.
The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the environment, the uploaded dataset is automatically placed in the /dataset directory:
if MnistDataset_torch.zip is selected, the dataset directory is /dataset/test.
The selected model file is placed in the /model directory.
The result download path is /result, and the Qizhi platform provides file downloads under the /result directory.
Note: the inference environment currently has no network access, so public images cannot be used;
the image must first be submitted to the Qizhi platform, and inference datasets must also be
uploaded to the Qizhi platform beforehand.
'''
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import os | |||||
import argparse | |||||
# Inference settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#Name of the model file
parser.add_argument('--modelname', help='model name')
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
test_dataset = mnist.MNIST(root='/dataset/test', train=False, transform=ToTensor(), | |||||
download=False) | |||||
test_loader = DataLoader(test_dataset, batch_size=256) | |||||
    #If the file name is fixed, model_path can be hard-coded
model_path = '/model/'+args.modelname | |||||
model = torch.load(model_path).to(device) | |||||
model.eval() | |||||
correct = 0 | |||||
_sum = 0 | |||||
    for idx, (test_x, test_label) in enumerate(test_loader):
        predict_y = model(test_x.to(device).float()).detach()
        predict_ys = np.argmax(predict_y.cpu(), axis=-1)
        matches = predict_ys == test_label
        correct += np.sum(matches.numpy(), axis=-1)
        _sum += matches.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum)) | |||||
    #Write the result to /result
filename = 'result.txt' | |||||
file_path = os.path.join('/result', filename) | |||||
with open(file_path, 'w') as file: | |||||
file.write('accuracy: {:.2f}'.format(correct / _sum)) |
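For reference, the NumPy/torch mix in the evaluation loop can be collapsed into a single pure-torch helper; this is a sketch of an equivalent formulation, not what the script ships:
```
import torch

@torch.no_grad()
def accuracy(model, loader, device):
    """Fraction of correctly classified samples over a DataLoader."""
    correct, total = 0, 0
    for x, y in loader:
        pred = model(x.to(device).float()).argmax(dim=-1).cpu()
        correct += (pred == y).sum().item()
        total += y.size(0)
    return correct / total
```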
@@ -0,0 +1,35 @@ | |||||
from torch.nn import Module | |||||
from torch import nn | |||||
class Model(Module): | |||||
def __init__(self): | |||||
super(Model, self).__init__() | |||||
self.conv1 = nn.Conv2d(1, 6, 5) | |||||
self.relu1 = nn.ReLU() | |||||
self.pool1 = nn.MaxPool2d(2) | |||||
self.conv2 = nn.Conv2d(6, 16, 5) | |||||
self.relu2 = nn.ReLU() | |||||
self.pool2 = nn.MaxPool2d(2) | |||||
self.fc1 = nn.Linear(256, 120) | |||||
self.relu3 = nn.ReLU() | |||||
self.fc2 = nn.Linear(120, 84) | |||||
self.relu4 = nn.ReLU() | |||||
self.fc3 = nn.Linear(84, 10) | |||||
self.relu5 = nn.ReLU() | |||||
def forward(self, x): | |||||
y = self.conv1(x) | |||||
y = self.relu1(y) | |||||
y = self.pool1(y) | |||||
y = self.conv2(y) | |||||
y = self.relu2(y) | |||||
y = self.pool2(y) | |||||
y = y.view(y.shape[0], -1) | |||||
y = self.fc1(y) | |||||
y = self.relu3(y) | |||||
y = self.fc2(y) | |||||
y = self.relu4(y) | |||||
y = self.fc3(y) | |||||
y = self.relu5(y) | |||||
return y |
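For reference, the 256 input features of fc1 follow from LeNet-style shape arithmetic on a 28×28 MNIST image: conv1 (5×5) gives 24×24, pooling gives 12×12, conv2 gives 8×8, pooling gives 4×4, and 16 channels × 4 × 4 = 256. A quick sanity check of the forward pass (a minimal sketch, assuming this file is saved as model.py):
```
import torch
from model import Model

m = Model()
x = torch.randn(2, 1, 28, 28)  # a batch of 2 MNIST-sized images
y = m(x)
print(y.shape)  # expected: torch.Size([2, 10])
```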
@@ -0,0 +1,125 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
1. The dataset structure of the single dataset in this example:
MnistDataset_torch.zip
├── test
└── train
2. Because of A100 compatibility, before using the training environment, please use the platform's
recommended image with CUDA 11, then adjust the code and submit the image.
The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded dataset is automatically placed in the /dataset directory.
Note: the paths differ between selecting a single dataset and multiple datasets.
(1) If a single dataset is selected: if MnistDataset_torch.zip is selected,
    the dataset directories are /dataset/train and /dataset/test;
    if multiple datasets are selected: if MnistDataset_torch.zip is selected,
    the dataset directories are /dataset/MnistDataset_torch/train and /dataset/MnistDataset_torch/test;
(2) If a pre-trained model file is selected, its path is passed to the script as the ckpt_url parameter.
The model download path defaults to /model. Please write model output to /model,
and the Qizhi platform will provide file downloads under the /model directory.
'''
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
import os | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=10, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used by the data loaders')
#Path of the pre-trained model file
parser.add_argument('--ckpt_url', default="", help='pretrain model path')
#Parameter declarations
WORKERS = 0  # number of dataloader worker threads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
model = Model().to(device) | |||||
optimizer = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
# Model training
def train(model, train_loader, epoch): | |||||
model.train() | |||||
train_loss = 0 | |||||
for i, data in enumerate(train_loader, 0): | |||||
x, y = data | |||||
x = x.to(device) | |||||
y = y.to(device) | |||||
optimizer.zero_grad() | |||||
y_hat = model(x) | |||||
loss = cost(y_hat, y) | |||||
loss.backward() | |||||
optimizer.step() | |||||
        train_loss += loss.item()
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))
# Model testing
def test(model, test_loader, test_data): | |||||
model.eval() | |||||
test_loss = 0 | |||||
correct = 0 | |||||
with torch.no_grad(): | |||||
for i, data in enumerate(test_loader, 0): | |||||
x, y = data | |||||
x = x.to(device) | |||||
y = y.to(device) | |||||
y_hat = model(x) | |||||
test_loss += cost(y_hat, y).item() | |||||
pred = y_hat.max(1, keepdim=True)[1] | |||||
correct += pred.eq(y.view_as(pred)).sum().item() | |||||
test_loss /= (i+1) | |||||
print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||||
test_loss, correct, len(test_data), 100. * correct / len(test_data))) | |||||
def main(): | |||||
    # If a saved checkpoint exists, load it and continue training from it
    if os.path.exists(args.ckpt_url):
        checkpoint = torch.load(args.ckpt_url)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Loaded epoch {} weights successfully!'.format(start_epoch))
    else:
        start_epoch = 0
        print('No saved model found, training from scratch!')
    for epoch in range(start_epoch + 1, epochs + 1):  # epochs are numbered from 1 in the checkpoint
        train(model, train_loader, epoch)
        test(model, test_loader, test_dataset)
        # Save the checkpoint
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        torch.save(state, '/model/mnist_epoch{}.pkl'.format(epoch))
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
epochs = args.epoch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
main() | |||||
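The checkpoint this script resumes from must be a dict with 'model', 'optimizer', and 'epoch' keys, exactly as main() saves it. A minimal sketch of producing a compatible starting checkpoint (hypothetical local path):
```
import torch
from torch.optim import SGD
from model import Model

# Save a fresh model/optimizer pair in the format main() expects to load.
model = Model()
optimizer = SGD(model.parameters(), lr=1e-1)
state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': 0}
torch.save(state, 'mnist_epoch0.pkl')  # hypothetical path
```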
@@ -0,0 +1,141 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
In the training environment,
(1) the code is automatically placed in the /tmp/code directory,
(2) the uploaded dataset is automatically placed in the /tmp/dataset directory.
Note: the paths differ between selecting a single dataset and multiple datasets.
If a single dataset is selected: if MnistDataset_torch.zip is selected,
the dataset directories are /tmp/dataset/train and /tmp/dataset/test;
the dataset structure of the single dataset in the training image in this example:
tmp
├── dataset
    ├── test
    └── train
If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
the dataset directories are /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl;
the dataset structure in the training image for multiple datasets in this example:
tmp
├── dataset
    ├── MnistDataset_torch
    |   ├── test
    |   └── train
    └── checkpoint_epoch1_0.73
        └── mnist_epoch1_0.73.pkl
(3) the model download path defaults to /tmp/output; please write model output to /tmp/output,
and the Qizhi platform will provide file downloads under the /tmp/output directory.
(4) If a pre-trained model file is selected, its path is passed to the script as the ckpt_url parameter.
In addition, if you want the model file back after each epoch of training, you can call the uploader_for_gpu tool:
import os
os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
'''
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
import os | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=10, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used by the data loaders')
#Path of the pre-trained model file
parser.add_argument('--ckpt_url', default="", help='pretrain model path')
#Parameter declarations
WORKERS = 0  # number of dataloader worker threads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
model = Model().to(device) | |||||
optimizer = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
# Model training
def train(model, train_loader, epoch): | |||||
model.train() | |||||
train_loss = 0 | |||||
for i, data in enumerate(train_loader, 0): | |||||
x, y = data | |||||
x = x.to(device) | |||||
y = y.to(device) | |||||
optimizer.zero_grad() | |||||
y_hat = model(x) | |||||
loss = cost(y_hat, y) | |||||
loss.backward() | |||||
optimizer.step() | |||||
        train_loss += loss.item()
    loss_mean = train_loss / (i+1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))
# Model testing
def test(model, test_loader, test_data): | |||||
model.eval() | |||||
test_loss = 0 | |||||
correct = 0 | |||||
with torch.no_grad(): | |||||
for i, data in enumerate(test_loader, 0): | |||||
x, y = data | |||||
x = x.to(device) | |||||
y = y.to(device) | |||||
y_hat = model(x) | |||||
test_loss += cost(y_hat, y).item() | |||||
pred = y_hat.max(1, keepdim=True)[1] | |||||
correct += pred.eq(y.view_as(pred)).sum().item() | |||||
test_loss /= (i+1) | |||||
print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||||
test_loss, correct, len(test_data), 100. * correct / len(test_data))) | |||||
def main(): | |||||
    # If a saved checkpoint exists, load it and continue training from it
    if os.path.exists(args.ckpt_url):
        checkpoint = torch.load(args.ckpt_url)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Loaded epoch {} weights successfully!'.format(start_epoch))
    else:
        start_epoch = 0
        print('No saved model found, training from scratch!')
    for epoch in range(start_epoch + 1, epochs + 1):  # epochs are numbered from 1 in the checkpoint
        train(model, train_loader, epoch)
        test(model, test_loader, test_dataset)
        # Save the checkpoint
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        torch.save(state, '/tmp/output/mnist_epoch{}.pkl'.format(epoch))
        # After each epoch, uploader_for_gpu sends the files under /tmp/output back to Qizhi
        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
epochs = args.epoch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
main() | |||||
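The upload step shells out unconditionally; a slightly more defensive variant, sketched under the assumption that /tmp/script_for_grampus only exists on the intelligent computing cluster, would be:
```
import os
import subprocess

def upload_outputs(output_dir='/tmp/output/'):
    """Send files under output_dir back to Qizhi via uploader_for_gpu, if the tool is present."""
    tool_dir = '/tmp/script_for_grampus/'
    if os.path.isdir(tool_dir):
        subprocess.run(['./uploader_for_gpu', output_dir], cwd=tool_dir, check=False)
    else:
        print('uploader_for_gpu not found; skipping upload (running outside the cluster?)')
```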
@@ -0,0 +1,80 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
GPU INFERENCE INSTANCE
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
Because of A100 compatibility, please use the platform's recommended image with CUDA 11,
then adjust the code and submit the image.
The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the environment, the uploaded dataset is automatically placed in the /dataset directory:
if MnistDataset_torch.zip is selected, the dataset directory is /dataset/test.
The selected model file is placed in the /model directory.
The result download path is /result, and the Qizhi platform provides file downloads under the /result directory.
Note: the inference environment currently has no network access, so public images cannot be used;
the image must first be submitted to the Qizhi platform, and inference datasets must also be
uploaded to the Qizhi platform beforehand.
'''
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import os | |||||
import argparse | |||||
from model import Model | |||||
# Inference settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
#Name of the model file
parser.add_argument('--modelname', help='model name')
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
test_dataset = mnist.MNIST(root='/dataset/test', train=False, transform=ToTensor(), | |||||
download=False) | |||||
test_loader = DataLoader(test_dataset, batch_size=256) | |||||
    #If the file name is fixed, model_path can be hard-coded
model_path = '/model/'+args.modelname | |||||
model = Model().to(device) | |||||
checkpoint = torch.load(model_path) | |||||
model.load_state_dict(checkpoint['model']) | |||||
model.eval() | |||||
correct = 0 | |||||
_sum = 0 | |||||
    for idx, (test_x, test_label) in enumerate(test_loader):
        predict_y = model(test_x.to(device).float()).detach()
        predict_ys = np.argmax(predict_y.cpu(), axis=-1)
        matches = predict_ys == test_label
        correct += np.sum(matches.numpy(), axis=-1)
        _sum += matches.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum)) | |||||
    #Write the result to /result
filename = 'result.txt' | |||||
file_path = os.path.join('/result', filename) | |||||
with open(file_path, 'w') as file: | |||||
file.write('accuracy: {:.2f}'.format(correct / _sum)) |
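This repository saves checkpoints in two formats: the plain train scripts call torch.save(model, ...) and produce whole-model pickles, while the pretrain scripts save a dict with 'model'/'optimizer'/'epoch' entries, which is what this script loads. A hypothetical helper that tolerates either format, as a sketch:
```
import torch
from model import Model

def load_mnist_model(path, device='cpu'):
    """Load either a pickled Model object or a {'model': state_dict, ...} checkpoint."""
    obj = torch.load(path, map_location=device)
    if isinstance(obj, Model):  # whole-model pickle (train.py style)
        return obj.to(device)
    model = Model().to(device)
    model.load_state_dict(obj['model'])  # state-dict checkpoint (pretrain style)
    return model
```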
@@ -0,0 +1,92 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
Because of A100 compatibility, before using the training environment, please use the platform's
recommended image with CUDA 11, then adjust the code and submit the image.
The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded dataset is automatically placed in the /dataset directory.
If a single dataset is selected:
    if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test.
If multiple datasets are selected:
    if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
    the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
    and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.
The model download path defaults to /model. Please write model output to /model,
and the Qizhi platform will provide file downloads under the /model directory.
'''
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used by the data loaders')
#Parameter declarations (model and optimizer are created in the __main__ block below)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
model = Model().to(device) | |||||
sgd = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
epoch = args.epoch_size | |||||
print('epoch_size is:{}'.format(epoch)) | |||||
for _epoch in range(epoch): | |||||
#print('the {} epoch_size begin'.format(_epoch + 1)) | |||||
model.train() | |||||
for idx, (train_x, train_label) in enumerate(train_loader): | |||||
train_x = train_x.to(device) | |||||
train_label = train_label.to(device) | |||||
sgd.zero_grad() | |||||
predict_y = model(train_x.float()) | |||||
loss = cost(predict_y, train_label.long()) | |||||
if idx % 10 == 0: | |||||
print('idx: {}, loss: {}'.format(idx, loss.sum().item())) | |||||
loss.backward() | |||||
sgd.step() | |||||
correct = 0 | |||||
_sum = 0 | |||||
model.eval() | |||||
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            matches = predict_ys == test_label
            correct += np.sum(matches.numpy(), axis=-1)
            _sum += matches.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum)) | |||||
        #The model output location is placed under /model
        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': _epoch + 1}
        torch.save(state, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
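Unlike train.py, this variant saves a state-dict checkpoint, so restoring it requires rebuilding the network first. A minimal sketch (hypothetical file name):
```
import torch
from torch.optim import SGD
from model import Model

checkpoint = torch.load('mnist_epoch1_0.95.pkl', map_location='cpu')  # hypothetical file
model = Model()
model.load_state_dict(checkpoint['model'])
optimizer = SGD(model.parameters(), lr=1e-1)
optimizer.load_state_dict(checkpoint['optimizer'])
print('resuming after epoch', checkpoint['epoch'])
```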
@@ -0,0 +1,111 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
In the training environment,
the code is automatically placed in the /tmp/code directory,
the uploaded dataset is automatically placed in the /tmp/dataset directory.
Note: the paths differ between selecting a single dataset and multiple datasets.
(1) If a single dataset is selected: if MnistDataset_torch.zip is selected,
the dataset directories are /tmp/dataset/train and /tmp/dataset/test;
the dataset structure of the single dataset in the training image in this example:
tmp
├── dataset
    ├── test
    └── train
If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
the dataset directories are /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl;
the dataset structure in the training image for multiple datasets in this example:
tmp
├── dataset
    ├── MnistDataset_torch
    |   ├── test
    |   └── train
    └── checkpoint_epoch1_0.73
        └── mnist_epoch1_0.73.pkl
(2) the model download path defaults to /tmp/output; please write model output to /tmp/output,
and the Qizhi platform will provide file downloads under the /tmp/output directory.
In addition, if you want the model file back after each epoch of training, you can call the uploader_for_gpu tool:
import os
os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
'''
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
import os | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /tmp/dataset
parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used by the data loaders')
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
model = Model().to(device) | |||||
sgd = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
epoch = args.epoch_size | |||||
print('epoch_size is:{}'.format(epoch)) | |||||
for _epoch in range(epoch): | |||||
print('the {} epoch_size begin'.format(_epoch + 1)) | |||||
model.train() | |||||
for idx, (train_x, train_label) in enumerate(train_loader): | |||||
train_x = train_x.to(device) | |||||
train_label = train_label.to(device) | |||||
sgd.zero_grad() | |||||
predict_y = model(train_x.float()) | |||||
loss = cost(predict_y, train_label.long()) | |||||
if idx % 10 == 0: | |||||
print('idx: {}, loss: {}'.format(idx, loss.sum().item())) | |||||
loss.backward() | |||||
sgd.step() | |||||
correct = 0 | |||||
_sum = 0 | |||||
model.eval() | |||||
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            matches = predict_ys == test_label
            correct += np.sum(matches.numpy(), axis=-1)
            _sum += matches.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum)) | |||||
        #The model output location is placed under /tmp/output
        state = {'model': model.state_dict(), 'optimizer': sgd.state_dict(), 'epoch': _epoch + 1}
        torch.save(state, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
        # After each epoch, uploader_for_gpu sends the files under /tmp/output back to Qizhi
        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
@@ -0,0 +1,113 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
'''
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
1. The dataset structure of the multi-dataset in this example:
MnistDataset_torch.zip
├── test
└── train
checkpoint_epoch1_0.73.zip
└── mnist_epoch1_0.73.pkl
2. Because of A100 compatibility, before using the training environment, please use the platform's
recommended image with CUDA 11, then adjust the code and submit the image.
The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded dataset is automatically placed in the /dataset directory.
Note: the paths differ between selecting a single dataset and multiple datasets.
(1) If a single dataset is selected: if MnistDataset_torch.zip is selected,
the dataset directories are /dataset/train and /dataset/test;
the dataset structure of the single dataset in the training image in this example:
dataset
├── test
└── train
(2) If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl;
the dataset structure in the training image for multiple datasets in this example:
dataset
├── MnistDataset_torch
|   ├── test
|   └── train
└── checkpoint_epoch1_0.73
    └── mnist_epoch1_0.73.pkl
The model download path defaults to /model. Please write model output to /model,
and the Qizhi platform will provide file downloads under the /model directory.
'''
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /dataset
parser.add_argument('--traindata', default="/dataset/MnistDataset_torch/train", help='path to train dataset')
parser.add_argument('--testdata', default="/dataset/MnistDataset_torch/test", help='path to test dataset')
parser.add_argument('--checkpoint', default="/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl", help='checkpoint file')
parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
parser.add_argument('--batch_size', type=int, default=256, help='batch size used by the data loaders')
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print('cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
model = Model().to(device) | |||||
sgd = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
epoch = args.epoch_size | |||||
print('epoch_size is:{}'.format(epoch)) | |||||
# Load the trained model | |||||
# path = args.checkpoint | |||||
# checkpoint = torch.load(path, map_location=device) | |||||
# model.load_state_dict(checkpoint) | |||||
for _epoch in range(epoch): | |||||
print('the {} epoch_size begin'.format(_epoch + 1)) | |||||
model.train() | |||||
for idx, (train_x, train_label) in enumerate(train_loader): | |||||
train_x = train_x.to(device) | |||||
train_label = train_label.to(device) | |||||
sgd.zero_grad() | |||||
predict_y = model(train_x.float()) | |||||
loss = cost(predict_y, train_label.long()) | |||||
if idx % 10 == 0: | |||||
print('idx: {}, loss: {}'.format(idx, loss.sum().item())) | |||||
loss.backward() | |||||
sgd.step() | |||||
correct = 0 | |||||
_sum = 0 | |||||
model.eval() | |||||
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            matches = predict_ys == test_label
            correct += np.sum(matches.numpy(), axis=-1)
            _sum += matches.shape[0]
print('accuracy: {:.2f}'.format(correct / _sum)) | |||||
#The model output location is placed under /model | |||||
torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) |
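The commented-out loading block hints at resuming from the supplied checkpoint, but load_state_dict would fail there because mnist_epoch1_0.73.pkl was saved as a whole model with torch.save(model, ...), which may be why it is disabled. A sketch of actually resuming from that file:
```
import torch

# The checkpoint is a pickled Model object, so it is restored whole;
# map_location keeps CPU-only debug runs working.
model = torch.load('/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl', map_location='cpu')
model.train()
```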
@@ -0,0 +1 @@ | |||||
hello |
@@ -0,0 +1,35 @@ | |||||
{ | |||||
"cells": [ | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": null, | |||||
"id": "14e5e20a", | |||||
"metadata": {}, | |||||
"outputs": [], | |||||
"source": [ | |||||
"print('hello world')" | |||||
] | |||||
} | |||||
], | |||||
"metadata": { | |||||
"kernelspec": { | |||||
"display_name": "MindSpore", | |||||
"language": "python", | |||||
"name": "mindspore" | |||||
}, | |||||
"language_info": { | |||||
"codemirror_mode": { | |||||
"name": "ipython", | |||||
"version": 3 | |||||
}, | |||||
"file_extension": ".py", | |||||
"mimetype": "text/x-python", | |||||
"name": "python", | |||||
"nbconvert_exporter": "python", | |||||
"pygments_lexer": "ipython3", | |||||
"version": "3.7.6" | |||||
} | |||||
}, | |||||
"nbformat": 4, | |||||
"nbformat_minor": 5 | |||||
} |
@@ -0,0 +1 @@ | |||||
well |
@@ -0,0 +1,251 @@ | |||||
{ | |||||
"cells": [ | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 1, | |||||
"id": "90e7b1d4", | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"ename": "ModuleNotFoundError", | |||||
"evalue": "No module named 'torch'", | |||||
"output_type": "error", | |||||
"traceback": [ | |||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", | |||||
"\u001b[0;32m<ipython-input-1-e39e8fd52943>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'torch'" | |||||
] | |||||
} | |||||
], | |||||
"source": [ | |||||
"# -*- coding: utf-8 -*-\n", | |||||
"from __future__ import print_function, division\n", | |||||
"\n", | |||||
"# import sys\n", | |||||
"# sys.path.append('/home/xujiahong/openI_benchmark/vechicle_reID_VechicleNet/')\n", | |||||
"\n", | |||||
"import time\n", | |||||
"import yaml\n", | |||||
"import pickle\n", | |||||
"import torch\n", | |||||
"import torch.nn as nn\n", | |||||
"import numpy as np\n", | |||||
"from torchvision import datasets,transforms\n", | |||||
"import os\n", | |||||
"import scipy.io\n", | |||||
"from tqdm import tqdm\n", | |||||
"from data_utils.model_train import ft_net\n", | |||||
"from utils.util import get_stream_logger\n", | |||||
"from config.mainconfig import OUTPUT_RESULT_DIR, CONFIG_PATH\n", | |||||
"\n", | |||||
"\n", | |||||
"\n", | |||||
"def fliplr(img):\n", | |||||
" '''flip horizontal'''\n", | |||||
" inv_idx = torch.arange(img.size(3)-1,-1,-1).long() # N x C x H x W\n", | |||||
" img_flip = img.index_select(3,inv_idx)\n", | |||||
" return img_flip\n", | |||||
"\n", | |||||
"def extract_feature(model, dataloaders, flip):\n", | |||||
" features = torch.FloatTensor()\n", | |||||
" count = 0\n", | |||||
" for _, data in enumerate(tqdm(dataloaders),0):\n", | |||||
" img, _ = data\n", | |||||
" n, c, h, w = img.size()\n", | |||||
" count += n\n", | |||||
"\n", | |||||
" input_img = img.cuda()\n", | |||||
" ff = model(input_img)\n", | |||||
"\n", | |||||
" if flip:\n", | |||||
" img = fliplr(img)\n", | |||||
" input_img = img.cuda()\n", | |||||
" outputs_flip = model(input_img)\n", | |||||
" ff += outputs_flip\n", | |||||
"\n", | |||||
" fnorm = torch.norm(ff, p=2, dim=1, keepdim=True)\n", | |||||
" ff = ff.div(fnorm.expand_as(ff))\n", | |||||
" #print(ff.shape)\n", | |||||
" features = torch.cat((features,ff.data.cpu().float()), 0)\n", | |||||
" #features = torch.cat((features,ff.data.float()), 0)\n", | |||||
" return features\n", | |||||
"\n", | |||||
"\n", | |||||
"def get_id(img_path):\n", | |||||
" '''\n", | |||||
" xjh: \n", | |||||
" example of the name of the img: 0769_c013_00074310_0\n", | |||||
" 0769 is the vehicleID, 013 is the cameraID, 00074310 is the frameID\n", | |||||
" '''\n", | |||||
" camera_id = []\n", | |||||
" labels = []\n", | |||||
" for path, _ in img_path:\n", | |||||
" #filename = path.split('/')[-1]\n", | |||||
" filename = os.path.basename(path) #get the name of images\n", | |||||
" # Test Gallery Image\n", | |||||
" if not 'c' in filename: \n", | |||||
" labels.append(9999999)\n", | |||||
" camera_id.append(9999999)\n", | |||||
" else:\n", | |||||
" #label = filename[0:4]\n", | |||||
" label = filename[0:5] #for benchmark_person\n", | |||||
" camera = filename.split('c')[1]\n", | |||||
" if label[0:2]=='-1':\n", | |||||
" labels.append(-1)\n", | |||||
" else:\n", | |||||
" labels.append(int(label))\n", | |||||
" #camera_id.append(int(camera[0:3]))\n", | |||||
" camera_id.append(int(camera[0:2]))#for benchmark_person\n", | |||||
" #print(camera[0:3])\n", | |||||
" return camera_id, labels\n", | |||||
"\n", | |||||
"\n", | |||||
"def test(config_file_path:str, logger):\n", | |||||
" #read config files\n", | |||||
" with open(config_file_path, encoding='utf-8') as f:\n", | |||||
" opts = yaml.load(f, Loader=yaml.SafeLoader)\n", | |||||
"\n", | |||||
" data_dir = opts['input']['dataset']['data_dir']\n", | |||||
" name = \"trained_\" + opts['input']['config']['name']\n", | |||||
" trained_model_name = name + \"_last.pth\"\n", | |||||
" save_path = OUTPUT_RESULT_DIR\n", | |||||
"\n", | |||||
" nclass = opts['input']['config']['nclass']\n", | |||||
" stride = opts['input']['config']['stride']\n", | |||||
" pool = opts['input']['config']['pool']\n", | |||||
" droprate = opts['input']['config']['droprate']\n", | |||||
" inputsize= opts['input']['config']['inputsize']\n", | |||||
" w = opts['input']['config']['w']\n", | |||||
" h = opts['input']['config']['h']\n", | |||||
" batchsize = opts['input']['config']['batchsize']\n", | |||||
" flip = opts['test']['flip_test']\n", | |||||
"\n", | |||||
" trained_model_path = os.path.join(save_path, trained_model_name)\n", | |||||
"\n", | |||||
" ##############################load model#################################################\n", | |||||
" ###self-train\n", | |||||
" model = ft_net(class_num = nclass, droprate = droprate, stride=stride, init_model=None, pool = pool, return_f=False)\n", | |||||
" \n", | |||||
" try:\n", | |||||
" model.load_state_dict(torch.load(trained_model_path))\n", | |||||
" except:\n", | |||||
" model = torch.nn.DataParallel(model)\n", | |||||
" model.load_state_dict(torch.load(trained_model_path))\n", | |||||
" model = model.module\n", | |||||
" model.classifier.classifier = nn.Sequential() #model ends with feature extractor(output len is 512)\n", | |||||
" # print(model)\n", | |||||
" \n", | |||||
" ##############################load dataset###############################################\n", | |||||
" \n", | |||||
" #transforms for input image h==w==299, inputsize==256\n", | |||||
" if h == w:\n", | |||||
" data_transforms = transforms.Compose([\n", | |||||
" transforms.Resize( ( round(inputsize*1.1), round(inputsize*1.1)), interpolation=3),\n", | |||||
" transforms.ToTensor(),\n", | |||||
" transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", | |||||
" ])\n", | |||||
" else:\n", | |||||
" data_transforms = transforms.Compose( [\n", | |||||
" transforms.Resize((round(h*1.1), round(w*1.1)), interpolation=3), #Image.BICUBIC\n", | |||||
" transforms.ToTensor(),\n", | |||||
" transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", | |||||
" ])\n", | |||||
"\n", | |||||
" image_datasets = {x: datasets.ImageFolder( os.path.join(data_dir,x) ,data_transforms) for x in ['bounding_box_test','query']}\n", | |||||
" dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batchsize,\n", | |||||
" shuffle=False, num_workers=8) for x in ['bounding_box_test','query']}\n", | |||||
"\n", | |||||
" #############################check GPU###################################################\n", | |||||
" use_gpu = torch.cuda.is_available()\n", | |||||
"\n", | |||||
"\n", | |||||
" #############################extract features############################################\n", | |||||
" # Change to test mode\n", | |||||
" model = model.eval()\n", | |||||
" if use_gpu:\n", | |||||
" model = model.cuda()\n", | |||||
"\n", | |||||
" gallery_path = image_datasets['bounding_box_test'].imgs\n", | |||||
" query_path = image_datasets['query'].imgs\n", | |||||
"\n", | |||||
" gallery_cam,gallery_label = get_id(gallery_path)\n", | |||||
" query_cam,query_label = get_id(query_path)\n", | |||||
"\n", | |||||
"\n", | |||||
" gallery_label = np.asarray(gallery_label)\n", | |||||
" query_label = np.asarray(query_label)\n", | |||||
" gallery_cam = np.asarray(gallery_cam)\n", | |||||
" query_cam = np.asarray(query_cam)\n", | |||||
" print('Gallery Size: %d'%len(gallery_label))\n", | |||||
" print('Query Size: %d'%len(query_label))\n", | |||||
" # Extract feature\n", | |||||
" since = time.time()\n", | |||||
" with torch.no_grad():\n", | |||||
" gallery_feature = extract_feature(model, dataloaders['bounding_box_test'], flip)\n", | |||||
" query_feature = extract_feature(model, dataloaders['query'], flip)\n", | |||||
" process_time = time.time() - since\n", | |||||
" logger.info('total forward time: %.2f minutes'%(process_time/60))\n", | |||||
" \n", | |||||
" dist = 1-torch.mm(query_feature, torch.transpose(gallery_feature, 0, 1))\n", | |||||
"\n", | |||||
" # Save to Matlab for check\n", | |||||
" extracted_feature = {'gallery_feature': gallery_feature.numpy(), 'gallery_label':gallery_label, 'gallery_cam':gallery_cam, \\\n", | |||||
" 'query_feature': query_feature.numpy(), 'query_label':query_label, 'query_cam':query_cam}\n", | |||||
"\n", | |||||
" result_name = os.path.join(save_path, name+'_feature.mat')\n", | |||||
" scipy.io.savemat(result_name, extracted_feature) \n", | |||||
"\n", | |||||
" return_dict = {}\n", | |||||
"\n", | |||||
" return_dict['dist'] = dist.numpy()\n", | |||||
" return_dict['feature_example'] = query_feature[0].numpy()\n", | |||||
" return_dict['gallery_label'] = gallery_label\n", | |||||
" return_dict['gallery_cam'] = gallery_cam\n", | |||||
" return_dict['query_label'] = query_label\n", | |||||
" return_dict['query_cam'] = query_cam\n", | |||||
"\n", | |||||
" pickle.dump(return_dict, open(OUTPUT_RESULT_DIR+'test_result.pkl', 'wb'), protocol=4)\n", | |||||
"\n", | |||||
" return \n", | |||||
"\n", | |||||
" # eval_result = evaluator(result, logger)\n", | |||||
" # full_table = display_eval_result(dict = eval_result)\n", | |||||
" # logger.info(full_table)\n", | |||||
"\n", | |||||
"if __name__==\"__main__\":\n", | |||||
" logger = get_stream_logger('TEST')\n", | |||||
" test(CONFIG_PATH, logger)" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": null, | |||||
"id": "c27b171e", | |||||
"metadata": {}, | |||||
"outputs": [], | |||||
"source": [] | |||||
} | |||||
], | |||||
"metadata": { | |||||
"kernelspec": { | |||||
"display_name": "MindSpore", | |||||
"language": "python", | |||||
"name": "mindspore" | |||||
}, | |||||
"language_info": { | |||||
"codemirror_mode": { | |||||
"name": "ipython", | |||||
"version": 3 | |||||
}, | |||||
"file_extension": ".py", | |||||
"mimetype": "text/x-python", | |||||
"name": "python", | |||||
"nbconvert_exporter": "python", | |||||
"pygments_lexer": "ipython3", | |||||
"version": "3.7.6" | |||||
} | |||||
}, | |||||
"nbformat": 4, | |||||
"nbformat_minor": 5 | |||||
} |
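In the notebook above, features are L2-normalized before the distance computation, so `1 - torch.mm(query_feature, gallery_feature.T)` is exactly cosine distance. A standalone sketch of that step:
```
import torch
import torch.nn.functional as F

q = F.normalize(torch.randn(4, 512), dim=1)   # query features (L2-normalized)
g = F.normalize(torch.randn(10, 512), dim=1)  # gallery features (L2-normalized)
dist = 1 - q @ g.T  # cosine distance, shape (4, 10); smaller means more similar
print(dist.shape)
```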
"\n", | |||||
" return \n", | |||||
"\n", | |||||
" # eval_result = evaluator(result, logger)\n", | |||||
" # full_table = display_eval_result(dict = eval_result)\n", | |||||
" # logger.info(full_table)\n", | |||||
"\n", | |||||
"if __name__==\"__main__\":\n", | |||||
" logger = get_stream_logger('TEST')\n", | |||||
" test(CONFIG_PATH, logger)" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": null, | |||||
"id": "c27b171e", | |||||
"metadata": {}, | |||||
"outputs": [], | |||||
"source": [] | |||||
} | |||||
], | |||||
"metadata": { | |||||
"kernelspec": { | |||||
"display_name": "MindSpore", | |||||
"language": "python", | |||||
"name": "mindspore" | |||||
}, | |||||
"language_info": { | |||||
"codemirror_mode": { | |||||
"name": "ipython", | |||||
"version": 3 | |||||
}, | |||||
"file_extension": ".py", | |||||
"mimetype": "text/x-python", | |||||
"name": "python", | |||||
"nbconvert_exporter": "python", | |||||
"pygments_lexer": "ipython3", | |||||
"version": "3.7.6" | |||||
} | |||||
}, | |||||
"nbformat": 4, | |||||
"nbformat_minor": 5 | |||||
} |
@@ -0,0 +1,149 @@ | |||||
""" | |||||
######################## train lenet example ######################## | |||||
train lenet and get network model files(.ckpt) | |||||
""" | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import json | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import time | |||||
### Copy multiple datasets from obs to training image ### | |||||
def MultiObsToEnv(multi_data_url, data_dir): | |||||
    #multi_data_url is a JSON string, so parse it before use | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
if not os.path.exists(file_path): | |||||
os.makedirs(file_path) | |||||
try: | |||||
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
#unzip dataset | |||||
os.system("unzip -d %s %s" % (file_path, path)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
    #Set a cache file to record that the data has been copied from obs. | |||||
#If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"): | |||||
        print("download_input succeed") | |||||
    else: | |||||
        print("download_input failed") | |||||
return | |||||
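### Illustrative only: the multi_data_url JSON that MultiObsToEnv expects looks like the | |||||
### following (the dataset name and obs URL here are assumptions, not real platform values): | |||||
### '[{"dataset_name": "MNISTData.zip", "dataset_url": "obs://bucket/MNISTData.zip"}]' | |||||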
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
        #If the cache file does not exist, the data copy has not finished yet, | |||||
        #so wait for the 0th card to finish copying the data | |||||
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
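### The pattern above is a simple file-based barrier (assuming 8 cards per node): the | |||||
### 0th card downloads the data and creates /cache/download_input.txt, and every other | |||||
### card polls for that marker file before proceeding to training. | |||||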
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
### --multi_data_url, --ckpt_url, --device_target: these 3 parameters must be defined first in a multi-dataset task, | |||||
### otherwise an error will be reported. | |||||
### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
### because they are predefined in the background, you only need to define them in your code. | |||||
parser.add_argument('--multi_data_url', | |||||
help='dataset path in obs') | |||||
parser.add_argument('--ckpt_url', | |||||
help='pre_train_model path in obs') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/dataset' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
    ###The dataset path used here is: data_dir + "/MNISTData" + "/train" | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
    if args.epoch_size: | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
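    ### Hypothetical local launch for reference (flag values are illustrative; the Qizhi | |||||
    ### platform normally injects these flags in the background): | |||||
    ### python train.py --multi_data_url '[{"dataset_name": "MNISTData.zip", "dataset_url": "obs://bucket/MNISTData.zip"}]' --epoch_size 5 | |||||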
@@ -0,0 +1,196 @@ | |||||
""" | |||||
######################## train lenet example ######################## | |||||
train lenet and get network model files(.ckpt) | |||||
""" | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import json | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import time | |||||
### Copy multiple datasets from obs to training image and unzip### | |||||
def C2netMultiObsToEnv(multi_data_url, data_dir): | |||||
    #multi_data_url is a JSON string, so parse it before use | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
try: | |||||
mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],zipfile_path)) | |||||
#get filename and unzip the dataset | |||||
filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
filePath = data_dir + "/" + filename | |||||
if not os.path.exists(filePath): | |||||
os.makedirs(filePath) | |||||
os.system("unzip {} -d {}".format(zipfile_path, filePath)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) | |||||
    #Set a cache file to record that the data has been copied from obs. | |||||
#If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"): | |||||
        print("download_input succeed") | |||||
    else: | |||||
        print("download_input failed") | |||||
return | |||||
### Copy the output model to obs ### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir, | |||||
obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir, | |||||
obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
C2netMultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
C2netMultiObsToEnv(multi_data_url,data_dir) | |||||
        #If the cache file does not exist, the data copy has not finished yet, | |||||
        #so wait for the 0th card to finish copying the data | |||||
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
### --multi_data_url, --ckpt_url, --train_url, --device_target: these 4 parameters must be defined first in a multi-dataset task, | |||||
### otherwise an error will be reported. | |||||
### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
### because they are predefined in the background, you only need to define them in your code. | |||||
parser.add_argument('--multi_data_url', | |||||
help='path to multi dataset', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--ckpt_url', | |||||
help='pre_train_model path in obs') | |||||
parser.add_argument('--train_url', | |||||
help='model folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
### continue task parameters | |||||
parser.add_argument('--ckpt_load_name', | |||||
help='model name to load', | |||||
default= '') | |||||
parser.add_argument('--ckpt_save_name', | |||||
help='model name to save', | |||||
default= 'checkpoint') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/dataset' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
    ###The dataset path used here is: data_dir + "/MNISTData" + "/train" | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
    ### Load the checkpoint to continue training | |||||
if args.ckpt_load_name: | |||||
C2netMultiObsToEnv(args.train_url, train_dir) | |||||
load_path = "{}/{}.ckpt".format(train_dir, args.ckpt_load_name) | |||||
load_param_into_net(network, load_checkpoint(load_path)) | |||||
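        # e.g. a hypothetical --ckpt_load_name of checkpoint-1_1875 resumes from | |||||
        # /cache/output/checkpoint-1_1875.ckpt saved by an earlier run via --ckpt_save_name | |||||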
if args.device_target != "Ascend": | |||||
model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name, | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
    if args.epoch_size: | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
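    ### Hypothetical resume launch for reference (flag values are illustrative): | |||||
    ### python train.py --ckpt_load_name checkpoint-1_1875 --ckpt_save_name checkpoint --epoch_size 5 | |||||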
@@ -0,0 +1,197 @@ | |||||
""" | |||||
######################## multi-dataset train lenet example ######################## | |||||
This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset | |||||
training tutorial train.py. This example cannot be used for a single dataset! | |||||
""" | |||||
""" | |||||
######################## Instructions for using the training environment ######################## | |||||
1、(1)The structure of the dataset uploaded for multi-dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
checkpoint_lenet-1_1875.zip | |||||
├── checkpoint_lenet-1_1875.ckpt | |||||
(2)The dataset structure in the training image for multiple datasets in this example | |||||
workroot | |||||
├── MNISTData | |||||
| ├── test | |||||
| └── train | |||||
└── checkpoint_lenet-1_1875 | |||||
├── checkpoint_lenet-1_1875.ckpt | |||||
2、Multi-dataset training requires predefined functions | |||||
(1)Copy multi-dataset from obs to training image | |||||
function MultiObsToEnv(multi_data_url, data_dir) | |||||
(2)Copy the output to obs | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
    (3)Download the input from Qizhi and initialize | |||||
     function DownloadFromQizhi(multi_data_url, data_dir) | |||||
    (4)Upload the output to Qizhi | |||||
     function UploadToQizhi(train_dir, obs_train_url) | |||||
3、4 parameters need to be defined | |||||
--data_url is the first dataset you selected on the Qizhi platform | |||||
--multi_data_url is the multi-dataset you selected on the Qizhi platform | |||||
--data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, | |||||
otherwise an error will be reported. | |||||
There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
because they are predefined in the background, you only need to define them in your code | |||||
4、How the dataset is used | |||||
    Multi-dataset tasks take multi_data_url as the input; inside the training image, a dataset is | |||||
    accessed via data_dir + dataset name + the file or folder name within that dataset. | |||||
    For example, the calling path of the train folder in the MNISTData dataset in this example is | |||||
    data_dir + "/MNISTData" + "/train" | |||||
For details, please refer to the following sample code. | |||||
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import json | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import time | |||||
### Copy multiple datasets from obs to training image ### | |||||
def MultiObsToEnv(multi_data_url, data_dir): | |||||
    #multi_data_url is a JSON string, so parse it before use | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
if not os.path.exists(file_path): | |||||
os.makedirs(file_path) | |||||
try: | |||||
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
#unzip dataset | |||||
os.system("unzip -d %s %s" % (file_path, path)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
    #Set a cache file to record that the data has been copied from obs. | |||||
#If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"): | |||||
        print("download_input succeed") | |||||
    else: | |||||
        print("download_input failed") | |||||
return | |||||
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
        #If the cache file does not exist, the data copy has not finished yet, | |||||
        #so wait for the 0th card to finish copying the data | |||||
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
### --multi_data_url, --ckpt_url, --device_target: these 3 parameters must be defined first in a multi-dataset task, | |||||
### otherwise an error will be reported. | |||||
### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
### because they are predefined in the background, you only need to define them in your code. | |||||
parser.add_argument('--multi_data_url', | |||||
help='dataset path in obs') | |||||
parser.add_argument('--ckpt_url', | |||||
help='pre_train_model path in obs') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/dataset' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
    ###The dataset path used here is: data_dir + "/MNISTData" + "/train" | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
    ###The checkpoint path used here is: data_dir + "/checkpoint_lenet-1_1875" + "/checkpoint_lenet-1_1875.ckpt" | |||||
load_param_into_net(network, load_checkpoint(os.path.join(data_dir + "/checkpoint_lenet-1_1875", | |||||
"checkpoint_lenet-1_1875.ckpt"))) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
    if args.epoch_size: | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
@@ -0,0 +1,233 @@ | |||||
""" | |||||
######################## single-dataset train lenet example ######################## | |||||
This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training | |||||
tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! | |||||
######################## Instructions for using the training environment ######################## | |||||
The image of the debugging environment and the image of the training environment are two different images, | |||||
and the working local directories are different. In the training task, you need to pay attention to the following points. | |||||
1、(1)The structure of the dataset uploaded for single dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
2、Single dataset training requires predefined functions | |||||
(1)Copy single dataset from obs to training image | |||||
function ObsToEnv(obs_data_url, data_dir) | |||||
(2)Copy the output to obs | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
(3)Download the input from Qizhi And Init | |||||
function DownloadFromQizhi(obs_data_url, data_dir) | |||||
(4)Upload the output to Qizhi | |||||
function UploadToQizhi(train_dir, obs_train_url) | |||||
(5)Copy ckpt file from obs to training image. | |||||
function ObsUrlToEnv(obs_ckpt_url, ckpt_url) | |||||
3、3 parameters need to be defined | |||||
--data_url is the dataset you selected on the Qizhi platform | |||||
--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, | |||||
otherwise an error will be reported. | |||||
There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
because they are predefined in the background, you only need to define them in your code. | |||||
4、How the dataset is used | |||||
    A single dataset uses data_url as the input, and data_dir (i.e. '/cache/dataset') as the access path | |||||
    of the dataset inside the image. | |||||
For details, please refer to the following sample code. | |||||
5、How to load the checkpoint file | |||||
The checkpoint file is loaded by the ckpt_url parameter | |||||
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import mindspore.ops as ops | |||||
import time | |||||
import json | |||||
#from upload import UploadOutput | |||||
### Copy single dataset from obs to training image### | |||||
def ObsToEnv(obs_data_url, data_dir): | |||||
try: | |||||
mox.file.copy_parallel(obs_data_url, data_dir) | |||||
print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
    #Set a cache file to record that the data has been copied from obs. | |||||
#If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"): | |||||
        print("download_input succeed") | |||||
    else: | |||||
        print("download_input failed") | |||||
return | |||||
### Copy ckpt file from obs to training image### | |||||
### To copy a folder, use mox.file.copy_parallel; to copy a single file, use mox.file.copy. | |||||
### This operation copies a single ckpt file. | |||||
def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||||
try: | |||||
mox.file.copy(obs_ckpt_url, ckpt_url) | |||||
print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||||
return | |||||
### Copy multiple datasets from obs to training image ### | |||||
def MultiObsToEnv(multi_data_url, data_dir): | |||||
    #multi_data_url is a JSON string, so parse it before use | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
if not os.path.exists(file_path): | |||||
os.makedirs(file_path) | |||||
try: | |||||
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
#unzip dataset | |||||
os.system("unzip -d %s %s" % (file_path, path)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
    #Set a cache file to record that the data has been copied from obs. | |||||
#If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"): | |||||
        print("download_input succeed") | |||||
    else: | |||||
        print("download_input failed") | |||||
return | |||||
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
        #If the cache file does not exist, the data copy has not finished yet, | |||||
        #so wait for the 0th card to finish copying the data | |||||
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
### --multi_data_url, --ckpt_url, --device_target: these 3 parameters must be defined first in a single-dataset task, | |||||
### otherwise an error will be reported. | |||||
###There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
###because they are predefined in the background, you only need to define them in your code. | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--multi_data_url', | |||||
help='dataset path in obs') | |||||
parser.add_argument('--ckpt_url', | |||||
help='pre_train_model path in obs') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/dataset' | |||||
train_dir = '/cache/output' | |||||
ckpt_url = '/cache/checkpoint.ckpt' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
###Copy ckpt file from obs to training image | |||||
ObsUrlToEnv(args.ckpt_url, ckpt_url) | |||||
###Copy data from obs to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
    ###The dataset path used here is: data_dir + "/MNISTData" + "/train" | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
###The ckpt path is used here:ckpt_url | |||||
print('-------ckpt_url is:', args.ckpt_url) | |||||
load_param_into_net(network, load_checkpoint(ckpt_url)) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
    if args.epoch_size: | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) |
@@ -0,0 +1,114 @@ | |||||
""" | |||||
######################## train lenet example ######################## | |||||
train lenet and get network model files(.ckpt) | |||||
""" | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import json | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import time | |||||
### Copy multiple datasets from obs to training image ### | |||||
def MultiObsToEnv(multi_data_url, data_dir): | |||||
    #multi_data_url is a JSON string, so parse it before use | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
if not os.path.exists(file_path): | |||||
os.makedirs(file_path) | |||||
try: | |||||
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
#unzip dataset | |||||
os.system("unzip -d %s %s" % (file_path, path)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
    #Set a cache file to record that the data has been copied from obs. | |||||
#If this file exists during multi-card training, there is no need to copy the dataset multiple times. | |||||
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"): | |||||
        print("download_input succeed") | |||||
    else: | |||||
        print("download_input failed") | |||||
return | |||||
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
        #If the cache file does not exist, the data copy has not finished yet, | |||||
        #so wait for the 0th card to finish copying the data | |||||
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
### --multi_data_url, --ckpt_url, --device_target: these 3 parameters must be defined first in a multi-dataset task, | |||||
### otherwise an error will be reported. | |||||
### There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
### because they are predefined in the background, you only need to define them in your code. | |||||
parser.add_argument('--multi_data_url', | |||||
help='dataset path in obs') | |||||
parser.add_argument('--ckpt_url', | |||||
help='pre_train_model path in obs') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/dataset' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
print("--------start ls:") | |||||
os.system("cd /cache/dataset; ls -al") | |||||
print("--------end ls-----------") | |||||
@@ -0,0 +1,33 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
""" | |||||
network config setting, will be used in train.py | |||||
""" | |||||
from easydict import EasyDict as edict | |||||
mnist_cfg = edict({ | |||||
'num_classes': 10, | |||||
'lr': 0.01, | |||||
'momentum': 0.9, | |||||
'epoch_size': 10, | |||||
'batch_size': 32, | |||||
'buffer_size': 1000, | |||||
'image_height': 32, | |||||
'image_width': 32, | |||||
'save_checkpoint_steps': 1875, | |||||
'keep_checkpoint_max': 150, | |||||
'air_name': "lenet", | |||||
}) |
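# Quick sanity check (illustrative, not part of the original config): EasyDict | |||||
# supports both attribute and key access to the same values. | |||||
if __name__ == "__main__": | |||||
    assert mnist_cfg.batch_size == mnist_cfg['batch_size'] == 32 | |||||
    print('epoch_size:', mnist_cfg.epoch_size)  # 10 | |||||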
@@ -0,0 +1,60 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
""" | |||||
Produce the dataset | |||||
""" | |||||
import mindspore.dataset as ds | |||||
import mindspore.dataset.vision.c_transforms as CV | |||||
import mindspore.dataset.transforms.c_transforms as C | |||||
from mindspore.dataset.vision import Inter | |||||
from mindspore.common import dtype as mstype | |||||
def create_dataset(data_path, batch_size=32, repeat_size=1, | |||||
num_parallel_workers=1): | |||||
""" | |||||
create dataset for train or test | |||||
""" | |||||
# define dataset | |||||
mnist_ds = ds.MnistDataset(data_path) | |||||
resize_height, resize_width = 32, 32 | |||||
rescale = 1.0 / 255.0 | |||||
shift = 0.0 | |||||
rescale_nml = 1 / 0.3081 | |||||
shift_nml = -1 * 0.1307 / 0.3081 | |||||
# define map operations | |||||
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
rescale_op = CV.Rescale(rescale, shift) | |||||
hwc2chw_op = CV.HWC2CHW() | |||||
type_cast_op = C.TypeCast(mstype.int32) | |||||
# apply map operations on images | |||||
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
# apply DatasetOps | |||||
buffer_size = 10000 | |||||
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
mnist_ds = mnist_ds.repeat(repeat_size) | |||||
return mnist_ds |
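# Illustrative usage sketch (the path is an assumption matching the training scripts | |||||
# above, not part of the original file): | |||||
if __name__ == "__main__": | |||||
    ds_train = create_dataset("/cache/dataset/MNISTData/train", batch_size=32) | |||||
    print("batches per epoch:", ds_train.get_dataset_size()) | |||||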
@@ -0,0 +1,55 @@ | |||||
""" | |||||
Produce the dataset: | |||||
与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取: | |||||
get_rank:获取当前设备在集群中的ID。 | |||||
get_group_size:获取集群数量。 | |||||
""" | |||||
import mindspore.dataset as ds | |||||
import mindspore.dataset.vision.c_transforms as CV | |||||
import mindspore.dataset.transforms.c_transforms as C | |||||
from mindspore.dataset.vision import Inter | |||||
from mindspore.common import dtype as mstype | |||||
from mindspore.communication.management import init, get_rank, get_group_size | |||||
def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, | |||||
num_parallel_workers=1, shard_id=0, num_shards=8): | |||||
""" | |||||
create dataset for train or test | |||||
""" | |||||
resize_height, resize_width = 32, 32 | |||||
rescale = 1.0 / 255.0 | |||||
shift = 0.0 | |||||
rescale_nml = 1 / 0.3081 | |||||
shift_nml = -1 * 0.1307 / 0.3081 | |||||
    # Get shard_id (ID of the current device in the cluster) and num_shards (number of devices); these override the function arguments. | |||||
shard_id = get_rank() | |||||
num_shards = get_group_size() | |||||
# define dataset | |||||
mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) | |||||
# define map operations | |||||
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
rescale_op = CV.Rescale(rescale, shift) | |||||
hwc2chw_op = CV.HWC2CHW() | |||||
type_cast_op = C.TypeCast(mstype.int32) | |||||
# apply map operations on images | |||||
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
# apply DatasetOps | |||||
buffer_size = 10000 | |||||
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
mnist_ds = mnist_ds.repeat(repeat_size) | |||||
return mnist_ds |
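# Note: the shard_id/num_shards arguments are overwritten inside the function by | |||||
# get_rank()/get_group_size(), so init() must have been called beforehand. | |||||
# Illustrative usage on a multi-card job (the path is an assumption): | |||||
#   init() | |||||
#   ds_train = create_dataset_parallel("/cache/dataset/MNISTData/train", batch_size=32) | |||||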
@@ -0,0 +1,60 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
"""LeNet.""" | |||||
import mindspore.nn as nn | |||||
from mindspore.common.initializer import Normal | |||||
class LeNet5(nn.Cell): | |||||
""" | |||||
Lenet network | |||||
Args: | |||||
num_class (int): Number of classes. Default: 10. | |||||
num_channel (int): Number of channels. Default: 1. | |||||
Returns: | |||||
Tensor, output tensor | |||||
Examples: | |||||
        >>> LeNet5(num_class=10) | |||||
""" | |||||
def __init__(self, num_class=10, num_channel=1, include_top=True): | |||||
super(LeNet5, self).__init__() | |||||
self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') | |||||
self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') | |||||
self.relu = nn.ReLU() | |||||
self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) | |||||
self.include_top = include_top | |||||
if self.include_top: | |||||
self.flatten = nn.Flatten() | |||||
self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) | |||||
self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) | |||||
self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) | |||||
def construct(self, x): | |||||
x = self.conv1(x) | |||||
x = self.relu(x) | |||||
x = self.max_pool2d(x) | |||||
x = self.conv2(x) | |||||
x = self.relu(x) | |||||
x = self.max_pool2d(x) | |||||
if not self.include_top: | |||||
return x | |||||
x = self.flatten(x) | |||||
x = self.relu(self.fc1(x)) | |||||
x = self.relu(self.fc2(x)) | |||||
x = self.fc3(x) | |||||
return x |
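# Quick shape check (illustrative, not part of the original file; assumes a working | |||||
# MindSpore install and uses dummy input values): | |||||
if __name__ == "__main__": | |||||
    import numpy as np | |||||
    from mindspore import Tensor, float32 | |||||
    net = LeNet5(num_class=10) | |||||
    out = net(Tensor(np.zeros((1, 1, 32, 32)), float32)) | |||||
    print(out.shape)  # expected: (1, 10) | |||||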
@@ -0,0 +1,231 @@ | |||||
""" | |||||
######################## single-dataset train lenet example ######################## | |||||
This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training | |||||
tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! | |||||
######################## Instructions for using the training environment ######################## | |||||
The image of the debugging environment and the image of the training environment are two different images, | |||||
and the working local directories are different. In the training task, you need to pay attention to the following points. | |||||
1、(1)The structure of the dataset uploaded for single dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
2、Single dataset training requires predefined functions | |||||
(1)Copy single dataset from obs to training image | |||||
function ObsToEnv(obs_data_url, data_dir) | |||||
(2)Copy the output to obs | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
(3)Download the input from Qizhi And Init | |||||
function DownloadFromQizhi(obs_data_url, data_dir) | |||||
(4)Upload the output to Qizhi | |||||
function UploadToQizhi(train_dir, obs_train_url) | |||||
(5)Copy ckpt file from obs to training image. | |||||
function ObsUrlToEnv(obs_ckpt_url, ckpt_url) | |||||
3、3 parameters need to be defined | |||||
--data_url is the dataset you selected on the Qizhi platform | |||||
--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, | |||||
otherwise an error will be reported. | |||||
There is no need to add these parameters to the running parameters of the Qizhi platform, | |||||
because they are predefined in the background, you only need to define them in your code. | |||||
4、How the dataset is used | |||||
    A single dataset uses data_url as the input, and data_dir (i.e. '/cache/dataset') as the access path | |||||
    of the dataset inside the image. | |||||
For details, please refer to the following sample code. | |||||
5、How to load the checkpoint file | |||||
The checkpoint file is loaded by the ckpt_url parameter | |||||
    In addition, if you want to upload the model files produced by training, you can call UploadOutput. | |||||
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import mindspore.ops as ops | |||||
import time | |||||
from upload import UploadOutput | |||||
### Copy single dataset from obs to training image### | |||||
def ObsToEnv(obs_data_url, data_dir): | |||||
try: | |||||
mox.file.copy_parallel(obs_data_url, data_dir) | |||||
print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
    #Set a cache file to mark that the dataset has been copied from obs to the training image.
    #If this file exists during multi-card training, there is no need to copy the dataset again.
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
try: | |||||
if os.path.exists("/cache/download_input.txt"): | |||||
print("download_input succeed") | |||||
except Exception as e: | |||||
print("download_input failed") | |||||
return | |||||
### Copy ckpt file from obs to the training image###
### Use mox.file.copy_parallel to copy folders and mox.file.copy to copy single files;
### here a single file is copied.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||||
try: | |||||
mox.file.copy(obs_ckpt_url, ckpt_url) | |||||
print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||||
return | |||||
### Copy the output to obs### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(obs_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
        #If the cache file does not exist, copying has not finished yet;
        #wait for the 0th card to finish copying the data
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
### --data_url, --train_url and --device_target must be defined first in a single-dataset task,
### otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--data_url', | |||||
help='path to training/inference dataset folder', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--train_url', | |||||
help='output folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument('--ckpt_url', | |||||
help='model to save/load', | |||||
default= '/cache/checkpoint.ckpt') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/data' | |||||
train_dir = '/cache/output' | |||||
ckpt_url = '/cache/checkpoint.ckpt' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
###Copy ckpt file from obs to training image | |||||
ObsUrlToEnv(args.ckpt_url, ckpt_url) | |||||
###Copy data from obs to training image | |||||
DownloadFromQizhi(args.data_url, data_dir) | |||||
    ### The dataset path used here is: data_dir + "/train"
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
    ### The ckpt path used here is: ckpt_url
load_param_into_net(network, load_checkpoint(ckpt_url)) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
#Custom callback, upload output after each epoch | |||||
uploadOutput = UploadOutput(train_dir,args.train_url) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor(), uploadOutput]) | |||||
###Copy the trained output data from the local running environment back to obs, | |||||
###and download it in the training task corresponding to the Qizhi platform | |||||
#This step is not required if UploadOutput is called | |||||
UploadToQizhi(train_dir,args.train_url) |
@@ -0,0 +1,245 @@ | |||||
""" | |||||
######################## Attention! ######################## | |||||
The intelligent computing network needs to use mox to copy the dataset and decompress it in the code;
please refer to the function C2netMultiObsToEnv()
######################## multi-dataset train lenet example ######################## | |||||
This example is a multi-dataset training tutorial. For a single dataset, please refer to the single-dataset
training tutorial train.py; this example cannot be used with a single dataset!
""" | |||||
""" | |||||
######################## Instructions for using the training environment ######################## | |||||
1、(1)The structure of the dataset uploaded for multi-dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
checkpoint_lenet-1_1875.zip | |||||
├── checkpoint_lenet-1_1875.ckpt | |||||
(2)The dataset structure in the training image for multiple datasets in this example | |||||
workroot | |||||
├── MNISTData | |||||
| ├── test | |||||
| └── train | |||||
└── checkpoint_lenet-1_1875 | |||||
├── checkpoint_lenet-1_1875.ckpt | |||||
2、Multi-dataset training requires predefined functions
(1)Copy multi-dataset from obs to training image and unzip
function C2netMultiObsToEnv(multi_data_url, data_dir)
(2)Copy the output to obs
function EnvToObs(train_dir, obs_train_url)
(3)Download the input from Qizhi and init
function DownloadFromQizhi(multi_data_url, data_dir)
(4)Upload the output to Qizhi
function UploadToQizhi(train_dir, obs_train_url)
3、Parameters that need to be defined
--multi_data_url is the multi-dataset you selected on the Qizhi platform.
--multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
4、How the datasets are used
Multiple datasets use multi_data_url as the input; data_dir + dataset name + file or folder name
inside the dataset is the path of the dataset in the training image.
For example, the path of the train folder of the MNISTData dataset in this example is
data_dir + "/MNISTData" + "/train"
For details, please refer to the sample code below.
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import json | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import time | |||||
### Copy multiple datasets from obs to training image and unzip### | |||||
def C2netMultiObsToEnv(multi_data_url, data_dir): | |||||
#--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
try: | |||||
mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],zipfile_path)) | |||||
#get filename and unzip the dataset | |||||
filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] | |||||
filePath = data_dir + "/" + filename | |||||
if not os.path.exists(filePath): | |||||
os.makedirs(filePath) | |||||
os.system("unzip {} -d {}".format(zipfile_path, filePath)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) | |||||
    #Set a cache file to mark that the datasets have been copied from obs to the training image.
    #If this file exists during multi-card training, there is no need to copy the datasets again.
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
try: | |||||
if os.path.exists("/cache/download_input.txt"): | |||||
print("download_input succeed") | |||||
except Exception as e: | |||||
print("download_input failed") | |||||
return | |||||
### Copy ckpt file from obs to the training image###
### Use mox.file.copy_parallel to copy folders and mox.file.copy to copy single files;
### here a single file is copied.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||||
try: | |||||
mox.file.copy(obs_ckpt_url, ckpt_url) | |||||
print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||||
return | |||||
### Copy the output model to obs ### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir, | |||||
obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir, | |||||
obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
C2netMultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
C2netMultiObsToEnv(multi_data_url,data_dir) | |||||
        #If the cache file does not exist, copying has not finished yet;
        #wait for the 0th card to finish copying the data
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
### --multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task,
### otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser.add_argument('--multi_data_url', | |||||
help='path to multi dataset', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--ckpt_url', | |||||
help='pre_train_model path in obs') | |||||
parser.add_argument('--train_url', | |||||
help='model folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/data' | |||||
train_dir = '/cache/output' | |||||
ckpt_url = '/cache/checkpoint.ckpt' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Copy ckpt file from obs to training image | |||||
ObsUrlToEnv(args.ckpt_url, ckpt_url) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
    ### The dataset path used here is: data_dir + "/MNISTData" + "/train"
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
    ### The ckpt path used here is: ckpt_url
load_param_into_net(network, load_checkpoint(ckpt_url)) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
    # Test code: upload the results back
    os.system("cd /cache/script_for_grampus/ && ./uploader_for_npu " + "/cache/code/")
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
###Copy the trained output data from the local running environment back to obs, | |||||
###and download it in the training task corresponding to the Qizhi platform | |||||
UploadToQizhi(train_dir,args.train_url) | |||||
@@ -0,0 +1,2 @@ | |||||
import paddle
print("test, paddle version is: " + paddle.__version__)
@@ -0,0 +1,199 @@ | |||||
#####################################################################################################
# Resume-training feature: when modifying a training task, if "reuse last result" is checked,
# the previous task's output can be read from the new task's output path
#
# Example usage
# - Add two training parameters:
#   'ckpt_save_name'  the output file name of this task
#   'ckpt_load_name'  the output file name of the previous task, used to load the model file
#                     produced last time; empty by default, in which case no file is read
# - The training code checks whether 'ckpt_load_name' is empty; if it is not, this is a resumed task
#####################################################################################################
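# Illustrative parameter values (hypothetical): a first task trained with
#   --ckpt_save_name=checkpoint
# produces files such as checkpoint-5_1875.ckpt under /cache/output; a follow-up task that
# reuses the last result can then resume with
#   --ckpt_load_name=checkpoint-5_1875
# which makes the code below load '/cache/output/checkpoint-5_1875.ckpt'.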
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import mindspore.ops as ops | |||||
import time | |||||
from upload import UploadOutput | |||||
### Copy single file from obs to training image### | |||||
def ObsToEnv(obs_data_url, data_dir): | |||||
try: | |||||
mox.file.copy_parallel(obs_data_url, data_dir) | |||||
print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
    #Set a cache file to mark that the dataset has been copied from obs to the training image.
    #If this file exists during multi-card training, there is no need to copy the dataset again.
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
try: | |||||
if os.path.exists("/cache/download_input.txt"): | |||||
print("download_input succeed") | |||||
except Exception as e: | |||||
print("download_input failed") | |||||
return | |||||
### Copy the output to obs### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(obs_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
        #If the cache file does not exist, copying has not finished yet;
        #wait for the 0th card to finish copying the data
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
### --data_url, --train_url and --device_target must be defined first in a single-dataset task,
### otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--data_url', | |||||
help='path to training/inference dataset folder', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--train_url', | |||||
help='output folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
### continue task parameters | |||||
parser.add_argument('--ckpt_load_name', | |||||
help='model name to load', | |||||
default= '') | |||||
parser.add_argument('--ckpt_save_name', | |||||
help='model name to save', | |||||
default= 'checkpoint') | |||||
if __name__ == "__main__": | |||||
args, unknown = parser.parse_known_args() | |||||
data_dir = '/cache/data' | |||||
base_path = '/cache/output' | |||||
try: | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(base_path): | |||||
os.makedirs(base_path) | |||||
    except Exception as e:
        print("failed to create path: " + str(e))
###Initialize and copy data to training image | |||||
###Copy data from obs to training image | |||||
DownloadFromQizhi(args.data_url, data_dir) | |||||
    ### The dataset path used here is: data_dir + "/train"
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
    ### Load the model when resuming training
if args.ckpt_load_name: | |||||
ObsToEnv(args.train_url, base_path) | |||||
load_path = "{}/{}.ckpt".format(base_path,args.ckpt_load_name) | |||||
load_param_into_net(network, load_checkpoint(load_path)) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=1) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
save_path = base_path + "/" | |||||
if device_num > 1: | |||||
save_path = base_path + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name, | |||||
directory=save_path, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
#Custom callback, upload output after each epoch | |||||
uploadOutput = UploadOutput(base_path,args.train_url) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor(), uploadOutput]) | |||||
###Copy the trained output data from the local running environment back to obs, | |||||
###and download it in the training task corresponding to the Qizhi platform | |||||
#This step is not required if UploadOutput is called | |||||
UploadToQizhi(base_path,args.train_url) |
@@ -0,0 +1,92 @@ | |||||
""" | |||||
######################## train lenet example ######################## | |||||
train lenet and get network model files(.ckpt) | |||||
""" | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
import os | |||||
import argparse | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.common import set_seed | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will be implemented (default: Ascend); to use the NPU on the Qizhi platform, add the run parameter device_target=Ascend on the training page')
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
set_seed(1) | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
print('args:') | |||||
print(args) | |||||
train_dir = '/cache/output' | |||||
data_dir = '/cache/dataset' | |||||
    # Important: this specifies whether training runs on the CPU or the Ascend NPU
context.set_context(mode=context.GRAPH_MODE, | |||||
device_target=args.device_target) | |||||
    # Create the dataset
ds_train = create_dataset(os.path.join(data_dir, "train"), | |||||
cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
    # Create the network
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
    # Define the model output path
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=train_dir, | |||||
config=config_ck) | |||||
    # Start training
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
    # Test code: upload the results back
    os.system("cd /cache/script_for_grampus/ && ./uploader_for_npu " + "/cache/code/")
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
print("============== Finish Training ==============") |
@@ -0,0 +1,92 @@ | |||||
""" | |||||
######################## train lenet example ######################## | |||||
train lenet and get network model files(.ckpt) | |||||
""" | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
import os | |||||
import argparse | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.common import set_seed | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will be implemented (default: Ascend); to use the NPU on the Qizhi platform, add the run parameter device_target=Ascend on the training page')
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
set_seed(1) | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
print('args:') | |||||
print(args) | |||||
train_dir = '/cache/output' | |||||
data_dir = '/cache/dataset' | |||||
    # Important: this specifies whether training runs on the CPU or the Ascend NPU
context.set_context(mode=context.GRAPH_MODE, | |||||
device_target=args.device_target) | |||||
    # Create the dataset
ds_train = create_dataset(os.path.join(data_dir, "train"), | |||||
cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
    # Create the network
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
    # Define the model output path
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=train_dir, | |||||
config=config_ck) | |||||
    # Start training
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
    # Test code: upload the results back
    os.system("cd /cache/script_for_grampus/ && ./uploader_for_npu " + "/cache/code")
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
print("============== Finish Training ==============") |
@@ -0,0 +1,14 @@ | |||||
from mindspore.train.callback import Callback | |||||
import moxing as mox | |||||
class UploadOutput(Callback): | |||||
def __init__(self, train_dir, obs_train_url): | |||||
self.train_dir = train_dir | |||||
self.obs_train_url = obs_train_url | |||||
    def epoch_end(self, run_context):
        try:
            mox.file.copy_parallel(self.train_dir, self.obs_train_url)
            print("Successfully Upload {} to {}".format(self.train_dir, self.obs_train_url))
        except Exception as e:
            print('moxing upload {} to {} failed: '.format(self.train_dir, self.obs_train_url) + str(e))
return |
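# Typical use (mirroring train.py in this repository): pass an instance in the callback list,
# e.g. model.train(epoch_size, ds_train, callbacks=[UploadOutput(train_dir, args.train_url)]),
# so the intermediate output is uploaded to obs at the end of every epoch.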
@@ -0,0 +1,6 @@ | |||||
from mindspore.train.callback import Callback | |||||
import os | |||||
class UploadOutput(Callback): | |||||
def epoch_end(self,run_context): | |||||
        os.system("cd /cache/script_for_grampus/ && ./uploader_for_npu " + "/cache/output/")
@@ -0,0 +1,2 @@ | |||||
# MNIST_PytorchExample_npu_multiNode | |||||
@@ -0,0 +1,33 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
""" | |||||
network config setting, will be used in train.py | |||||
""" | |||||
from easydict import EasyDict as edict | |||||
mnist_cfg = edict({ | |||||
'num_classes': 10, | |||||
'lr': 0.01, | |||||
'momentum': 0.9, | |||||
'epoch_size': 10, | |||||
'batch_size': 32, | |||||
'buffer_size': 1000, | |||||
'image_height': 32, | |||||
'image_width': 32, | |||||
'save_checkpoint_steps': 1875, | |||||
'keep_checkpoint_max': 150, | |||||
'air_name': "lenet", | |||||
}) |
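# Note: MNIST has 60000 training images, so with batch_size 32 (and drop_remainder=True in
# dataset.py) one epoch is 60000 // 32 = 1875 steps; save_checkpoint_steps above therefore
# saves a checkpoint once per epoch.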
@@ -0,0 +1,60 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
""" | |||||
Produce the dataset | |||||
""" | |||||
import mindspore.dataset as ds | |||||
import mindspore.dataset.vision.c_transforms as CV | |||||
import mindspore.dataset.transforms.c_transforms as C | |||||
from mindspore.dataset.vision import Inter | |||||
from mindspore.common import dtype as mstype | |||||
def create_dataset(data_path, batch_size=32, repeat_size=1, | |||||
num_parallel_workers=1): | |||||
""" | |||||
create dataset for train or test | |||||
""" | |||||
# define dataset | |||||
mnist_ds = ds.MnistDataset(data_path) | |||||
resize_height, resize_width = 32, 32 | |||||
rescale = 1.0 / 255.0 | |||||
shift = 0.0 | |||||
rescale_nml = 1 / 0.3081 | |||||
shift_nml = -1 * 0.1307 / 0.3081 | |||||
# define map operations | |||||
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
rescale_op = CV.Rescale(rescale, shift) | |||||
hwc2chw_op = CV.HWC2CHW() | |||||
type_cast_op = C.TypeCast(mstype.int32) | |||||
# apply map operations on images | |||||
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
# apply DatasetOps | |||||
buffer_size = 10000 | |||||
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
mnist_ds = mnist_ds.repeat(repeat_size) | |||||
return mnist_ds |
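# Example usage (hypothetical path), as in train.py:
#   ds_train = create_dataset('/cache/data/train', batch_size=32)
#   print(ds_train.get_dataset_size())  # number of 32-sample batches per epoch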
@@ -0,0 +1,54 @@ | |||||
""" | |||||
Produce the dataset: | |||||
与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取: | |||||
get_rank:获取当前设备在集群中的ID。 | |||||
get_group_size:获取集群数量。 | |||||
""" | |||||
import mindspore.dataset as ds | |||||
import mindspore.dataset.vision.c_transforms as CV | |||||
import mindspore.dataset.transforms.c_transforms as C | |||||
from mindspore.dataset.vision import Inter | |||||
from mindspore.common import dtype as mstype | |||||
from mindspore.communication.management import get_rank, get_group_size | |||||
def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, | |||||
num_parallel_workers=1, shard_id=0, num_shards=8): | |||||
""" | |||||
create dataset for train or test | |||||
""" | |||||
resize_height, resize_width = 32, 32 | |||||
rescale = 1.0 / 255.0 | |||||
shift = 0.0 | |||||
rescale_nml = 1 / 0.3081 | |||||
shift_nml = -1 * 0.1307 / 0.3081 | |||||
    # Get shard_id and num_shards: the ID of the current device in the cluster and the cluster size.
shard_id = get_rank() | |||||
num_shards = get_group_size() | |||||
# define dataset | |||||
mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) | |||||
# define map operations | |||||
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
rescale_op = CV.Rescale(rescale, shift) | |||||
hwc2chw_op = CV.HWC2CHW() | |||||
type_cast_op = C.TypeCast(mstype.int32) | |||||
# apply map operations on images | |||||
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
# apply DatasetOps | |||||
buffer_size = 10000 | |||||
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
mnist_ds = mnist_ds.repeat(repeat_size) | |||||
return mnist_ds |
@@ -0,0 +1,60 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
"""LeNet.""" | |||||
import mindspore.nn as nn | |||||
from mindspore.common.initializer import Normal | |||||
class LeNet5(nn.Cell): | |||||
""" | |||||
Lenet network | |||||
Args: | |||||
num_class (int): Number of classes. Default: 10. | |||||
num_channel (int): Number of channels. Default: 1. | |||||
Returns: | |||||
Tensor, output tensor | |||||
Examples: | |||||
        >>> LeNet5(num_class=10)
""" | |||||
def __init__(self, num_class=10, num_channel=1, include_top=True): | |||||
super(LeNet5, self).__init__() | |||||
self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') | |||||
self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') | |||||
self.relu = nn.ReLU() | |||||
self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) | |||||
self.include_top = include_top | |||||
if self.include_top: | |||||
self.flatten = nn.Flatten() | |||||
self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) | |||||
self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) | |||||
self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) | |||||
def construct(self, x): | |||||
x = self.conv1(x) | |||||
x = self.relu(x) | |||||
x = self.max_pool2d(x) | |||||
x = self.conv2(x) | |||||
x = self.relu(x) | |||||
x = self.max_pool2d(x) | |||||
if not self.include_top: | |||||
return x | |||||
x = self.flatten(x) | |||||
x = self.relu(self.fc1(x)) | |||||
x = self.relu(self.fc2(x)) | |||||
x = self.fc3(x) | |||||
return x |
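# Shape walkthrough for the default 1x32x32 MNIST input (derived from the layers above):
#   conv1 (5x5, valid) -> 6x28x28,  max_pool2d -> 6x14x14
#   conv2 (5x5, valid) -> 16x10x10, max_pool2d -> 16x5x5
#   flatten -> 400 (= 16*5*5), fc1 -> 120, fc2 -> 84, fc3 -> num_class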
@@ -0,0 +1,211 @@ | |||||
""" | |||||
######################## single-dataset train lenet example ######################## | |||||
This example is a single-dataset training tutorial. For multiple datasets, please refer to the multi-dataset
training tutorial train_for_multidataset.py; this example cannot be used with multiple datasets!
######################## Instructions for using the training environment ######################## | |||||
The image of the debugging environment and the image of the training environment are two different images, | |||||
and the working local directories are different. In the training task, you need to pay attention to the following points. | |||||
1、(1)The structure of the dataset uploaded for single dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
2、Single dataset training requires predefined functions | |||||
(1)Copy single dataset from obs to training image | |||||
function ObsToEnv(obs_data_url, data_dir) | |||||
(2)Copy the output to obs | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
(3)Download the input from Qizhi and init
function DownloadFromQizhi(obs_data_url, data_dir) | |||||
(4)Upload the output to Qizhi | |||||
function UploadToQizhi(train_dir, obs_train_url) | |||||
3、Three parameters need to be defined
--data_url is the dataset you selected on the Qizhi platform.
--data_url, --train_url and --device_target must be defined first in a single-dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the running parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
4、How the dataset is used
A single dataset uses data_url as the input and data_dir (i.e. '/cache/data') as the path
of the dataset inside the training image.
For details, please refer to the sample code below.
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank, get_group_size | |||||
import mindspore.ops as ops | |||||
import time | |||||
### Copy single dataset from obs to training image### | |||||
def ObsToEnv(obs_data_url, data_dir): | |||||
try: | |||||
mox.file.copy_parallel(obs_data_url, data_dir) | |||||
print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
    #Set a cache file to mark that the dataset has been copied from obs to the training image.
    #If this file exists during multi-card training, there is no need to copy the dataset again.
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
try: | |||||
if os.path.exists("/cache/download_input.txt"): | |||||
print("download_input succeed") | |||||
except Exception as e: | |||||
print("download_input failed") | |||||
return | |||||
### Copy the output to obs### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(obs_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
node_num = get_group_size() | |||||
if device_num == 1: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1 and node_num == 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
#Copying obs data does not need to be executed multiple times, just let the 0th card copy the data | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
        #If the cache file does not exist, copying has not finished yet;
        #wait for the 0th card to finish copying the data
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
if node_num > 1: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
### --data_url, --train_url and --device_target must be defined first in a single-dataset task,
### otherwise an error will be reported.
### There is no need to add these parameters to the running parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--data_url', | |||||
help='path to training/inference dataset folder', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--train_url', | |||||
help='output folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
parser.add_argument('--distributed',
                    type=bool,
                    default=True,
                    help='Whether to perform distributed training. Note: with type=bool, argparse treats any non-empty string as True.')
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
data_dir = '/cache/data' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
if args.distributed: | |||||
init() | |||||
DownloadFromQizhi(args.data_url, data_dir) | |||||
    ### The dataset path used here is: data_dir + "/train"
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
###Copy the trained output data from the local running environment back to obs, | |||||
###and download it in the training task corresponding to the Qizhi platform | |||||
UploadToQizhi(train_dir,args.train_url) |
@@ -0,0 +1,5 @@ | |||||
This needs to be a multi-machine, multi-node task, e.g. 2 nodes with 2 cards each.
Selecting a single dataset is enough, e.g. MNISTData.
@@ -0,0 +1,99 @@ | |||||
# How to train a model on the Qizhi platform - NPU version
- **Single-dataset training and multi-dataset training are used differently on the Qizhi cluster and on the intelligent computing network cluster; choose one training mode according to your needs and mind the differences (the environments below default to the training environment)**:
    - For single-dataset training with one or more cards on the Qizhi cluster, see the code comments of [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py)
    - For single-dataset inference with one card on the Qizhi cluster, see the code comments of [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py)
    - For multi-dataset training with one or more cards on the Qizhi cluster, see the code comments of [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_multidataset.py)
    - For single-dataset training with one or more cards on the intelligent computing network cluster, see the code comments of [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_c2net.py)
    - For more tutorials on distributed training, see the MindSpore official tutorial [MindSpore distributed training tutorial](https://www.mindspore.cn/tutorial/training/zh-CN/r1.2/advanced_use/distributed_training_ascend.html)
- **Differences between single-dataset and multi-dataset training on the NPU Qizhi cluster**:
    - Different hyperparameters:
      a single dataset is passed through --data_url;
      multiple datasets are passed through --multi_data_url, and --data_url still needs to be kept
    - Different dataset paths (see the sketch below):
      with a single dataset, MNISTData.zip in this example is used at /cache/data;
      with multiple datasets, MNISTData.zip is used at /cache/data/MNISTData/
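A minimal sketch of the two calling conventions described above (paths follow this example; only `MNISTData` is specific to it):

```python
import os

data_dir = '/cache/data'
# single dataset: the archive content sits directly under data_dir
single_train = os.path.join(data_dir, 'train')
# multiple datasets: each dataset is unzipped into a folder named after its archive
multi_train = os.path.join(data_dir, 'MNISTData', 'train')
```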
- **Differences between the NPU Qizhi cluster and the intelligent computing network cluster**:
    - the Qizhi cluster needs moxing to copy data to obs
    - the intelligent computing network cluster does not need moxing to copy data to obs
- **Differences between the debugging-image and training-image environments on the NPU Qizhi cluster**:
    - To use multi-card parallel training in the debugging environment, see the example [multi-card parallel example for the debugging environment](https://git.openi.org.cn/OpenIOSSG/MNIST_Example_NPU_Debug)
## 1 Overview
- Taking LeNet-MNIST as an example, this project briefly introduces how to complete a training task with MindSpore on the Qizhi AI collaboration platform, and provides code examples for single-dataset training, multi-dataset training, training on the intelligent computing network, and single-dataset inference, aiming to give AI developers an NPU training example for Qizhi. If you have any questions about the example code, feel free to open an issue in this project.
- Users can directly use the datasets and code files provided by this project to create their own training tasks.
- The Qizhi platform connects to ModelArts and OBS, integrating datasets, code and training resource pools on the Qizhi AI collaboration platform for developers.
- ModelArts is Huawei Cloud's one-stop AI development platform for developers; it integrates Ascend AI processor resource pools, and users can try MindSpore on ModelArts.
- OBS is the storage service provided by Huawei Cloud.
## 2 Preparation
- To use the Qizhi platform, you need to create a Qizhi account, clone the code into your own account, and upload the dataset; for detailed instructions, you can study the beginner training-camp courses of the [OpenI_Learning](https://git.openi.org.cn/zeizei/OpenI_Learning) project.
### 2.1 Data preparation
#### Dataset download
- The dataset can be downloaded from this project's dataset page: [dataset download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/datasets?type=1)
- Data file description
    - The MNISTData dataset consists of 28*28 grayscale images in 10 classes; the training set contains 60000 images and the test set contains 10000 images.
    - The directory structure of the dataset archives is as follows:
> MNIST_Data.zip | |||||
> ├── test | |||||
> │ ├── t10k-images-idx3-ubyte | |||||
> │ └── t10k-labels-idx1-ubyte | |||||
> └── train | |||||
> ├── train-images-idx3-ubyte | |||||
> └── train-labels-idx1-ubyte | |||||
> checkpoint_lenet-1_1875.zip | |||||
> ├── checkpoint_lenet-1_1875.ckpt | |||||
#### Dataset upload
- Since this example is developed with MindSpore and needs to run on NPU chips, the dataset must be uploaded to the NPU page.\
[Note: if you just want to trial-run this example, there is no need to upload the dataset again, because the MNIST_Example dataset used here is already a public dataset and can be referenced directly, or used after liking/favoriting it]
- As shown below:
-  | |||||
### 2.2 Script preparation
#### Example code
- The example code can be downloaded from this repository: [code download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example)
- Code file description
    - [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py), the script for single-dataset training on the Qizhi cluster, covering copying the dataset from obs into the training image, setting the number of epochs, copying the trained model back to obs, etc. For details, see the code comments of [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py)
    - [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_c2net.py), the script for training on the intelligent computing network, covering setting the number of epochs, etc. For details, see the code comments of [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_c2net.py)
    - [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_multidataset.py), the script for multi-dataset training on the Qizhi cluster, covering copying multiple datasets from obs into the training image, setting the number of epochs, copying the trained model back to obs, etc. For details, see the code comments of [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_multidataset.py)
    - [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py), the inference script for the Qizhi cluster, covering copying the dataset from obs into the training image, setting the number of epochs, copying the resulting model data back to obs, etc. For details, see the code comments of [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py)
    - [config.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/config.py), the network configuration, used by the single-dataset, multi-dataset and intelligent-computing-network training scripts.
    - [dataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/dataset.py), preprocesses the raw data into a dataset usable for network training; used by the single-dataset, multi-dataset and intelligent-computing-network training scripts.
    - [lenet.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/lenet.py), the network used for training; used by the single-dataset, multi-dataset and intelligent-computing-network training scripts.
    - [dataset_distributes.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/dataset_distributes.py), preprocesses the raw data into a dataset usable for single-machine multi-card training.
## 3 创建训练任务 | |||||
- 准备好数据和执行脚本以后,需要创建训练任务将MindSpore脚本真正运行起来。首次使用的用户可参考本示例代码。 | |||||
### 使用MindSpore作为训练框架创建训练作业,界面截图如下图所示。 | |||||
 | |||||
Table 1. Parameters on the training-job creation page
| Parameter | Description |
| ----------------- | ----------- |
| Code branch | The branch of the repository code to use; master can be chosen by default. |
| AI engine | Choose [Ascend-Powered-Engine] and the required MindSpore version (the screenshot shows [Mindspore-1.3.0-python3.7-aarch64]; be sure to use the scripts matching the selected version). |
| Boot file | The startup script in the code directory. |
| Dataset | A dataset that has been uploaded to the Qizhi platform. |
| Run parameters | For a single dataset, the dataset location and training output location map to the run parameters data_url and train_url; for multiple datasets, add the parameter multi_data_url and declare it in your code. Adding run parameters passes values to other script arguments, e.g. epoch_size (see the sketch below). Only those extra values need to be entered here: data_url and train_url are added to the run parameters by default, so you do not need to specify them again, only define them in your code. |
| Resource pool | Choose the flavor [Ascend: 1 * Ascend 910 CPU: 24 cores 256GiB], i.e. single machine, single card. |
<!-- Note: to use the CPU on the Qizhi platform, add the run parameter device_target=CPU on the training page; otherwise Ascend is the default, as shown below
 -->
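As a sketch of how an extra run parameter such as epoch_size reaches the script (the predefined parameter names follow the sample scripts in this repository; the platform fills in their actual values):

```python
import argparse

parser = argparse.ArgumentParser()
# predefined in the background; they must be declared or parsing will fail
parser.add_argument('--data_url', default='/cache/data/')
parser.add_argument('--train_url', default='/cache/output/')
# an extra run parameter added on the job-creation page, e.g. epoch_size=10
parser.add_argument('--epoch_size', type=int, default=5)
args = parser.parse_args()
print('epoch_size is:', args.epoch_size)  # the value entered in the "Run parameters" field
```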
## 4 Viewing the results
### 4.1 The run log can be viewed on the training-job page
 | |||||
### 4.2 The model files can be downloaded after training ends
 | |||||
## If you have any questions about the sample code, feel free to open an issue in this project.
@@ -0,0 +1,33 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
""" | |||||
network config setting, will be used in train.py | |||||
""" | |||||
from easydict import EasyDict as edict | |||||
mnist_cfg = edict({ | |||||
'num_classes': 10, | |||||
'lr': 0.01, | |||||
'momentum': 0.9, | |||||
'epoch_size': 10, | |||||
'batch_size': 32, | |||||
'buffer_size': 1000, | |||||
'image_height': 32, | |||||
'image_width': 32, | |||||
'save_checkpoint_steps': 1875, | |||||
'keep_checkpoint_max': 150, | |||||
'air_name': "lenet", | |||||
}) |
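# Note: EasyDict allows both attribute access and item access, e.g. mnist_cfg.batch_size
# and mnist_cfg['epoch_size']; both styles are used by the training scripts below.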
@@ -0,0 +1,60 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
""" | |||||
Produce the dataset | |||||
""" | |||||
import mindspore.dataset as ds | |||||
import mindspore.dataset.vision.c_transforms as CV | |||||
import mindspore.dataset.transforms.c_transforms as C | |||||
from mindspore.dataset.vision import Inter | |||||
from mindspore.common import dtype as mstype | |||||
def create_dataset(data_path, batch_size=32, repeat_size=1, | |||||
num_parallel_workers=1): | |||||
""" | |||||
create dataset for train or test | |||||
""" | |||||
# define dataset | |||||
mnist_ds = ds.MnistDataset(data_path) | |||||
resize_height, resize_width = 32, 32 | |||||
rescale = 1.0 / 255.0 | |||||
shift = 0.0 | |||||
rescale_nml = 1 / 0.3081 | |||||
shift_nml = -1 * 0.1307 / 0.3081 | |||||
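    # The two Rescale ops below compose to (x / 255 - 0.1307) / 0.3081, i.e.
    # standardization with the commonly used MNIST mean (0.1307) and std (0.3081).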
# define map operations | |||||
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
rescale_op = CV.Rescale(rescale, shift) | |||||
hwc2chw_op = CV.HWC2CHW() | |||||
type_cast_op = C.TypeCast(mstype.int32) | |||||
# apply map operations on images | |||||
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
# apply DatasetOps | |||||
buffer_size = 10000 | |||||
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
mnist_ds = mnist_ds.repeat(repeat_size) | |||||
return mnist_ds |
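# A minimal usage sketch (the path assumes the single-dataset layout described in the README):
#   ds_train = create_dataset('/cache/data/train', batch_size=32)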
@@ -0,0 +1,54 @@ | |||||
""" | |||||
Produce the dataset:
Unlike the single-machine case, the dataset interface needs the num_shards and shard_id arguments,
which correspond to the number of cards and the logical rank of this card. It is recommended to
obtain them through the HCCL interfaces:
get_rank: gets the ID of the current device within the cluster.
get_group_size: gets the number of devices in the cluster.
""" | |||||
import mindspore.dataset as ds | |||||
import mindspore.dataset.vision.c_transforms as CV | |||||
import mindspore.dataset.transforms.c_transforms as C | |||||
from mindspore.dataset.vision import Inter | |||||
from mindspore.common import dtype as mstype | |||||
from mindspore.communication.management import get_rank, get_group_size | |||||
def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, | |||||
num_parallel_workers=1, shard_id=0, num_shards=8): | |||||
""" | |||||
create dataset for train or test | |||||
""" | |||||
resize_height, resize_width = 32, 32 | |||||
rescale = 1.0 / 255.0 | |||||
shift = 0.0 | |||||
rescale_nml = 1 / 0.3081 | |||||
shift_nml = -1 * 0.1307 / 0.3081 | |||||
    # Override shard_id and num_shards with values from HCCL: the rank of the current
    # device in the cluster and the total number of devices, respectively.
shard_id = get_rank() | |||||
num_shards = get_group_size() | |||||
# define dataset | |||||
mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) | |||||
# define map operations | |||||
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode | |||||
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) | |||||
rescale_op = CV.Rescale(rescale, shift) | |||||
hwc2chw_op = CV.HWC2CHW() | |||||
type_cast_op = C.TypeCast(mstype.int32) | |||||
# apply map operations on images | |||||
mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) | |||||
# apply DatasetOps | |||||
buffer_size = 10000 | |||||
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script | |||||
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) | |||||
mnist_ds = mnist_ds.repeat(repeat_size) | |||||
return mnist_ds |
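# A minimal usage sketch (assuming the platform has set up the HCCL environment and
# communication has been initialized, as done in the training scripts below):
#   from mindspore.communication.management import init
#   init()
#   ds_train = create_dataset_parallel('/cache/data/train', batch_size=32)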
@@ -0,0 +1,139 @@ | |||||
""" | |||||
######################## single-dataset inference lenet example ######################## | |||||
This example is a single-dataset inference tutorial. | |||||
######################## Instructions for using the inference environment ######################## | |||||
1、Inference task requires predefined functions | |||||
(1)Copy single dataset from obs to inference image. | |||||
function ObsToEnv(obs_data_url, data_dir) | |||||
(2)Copy ckpt file from obs to inference image. | |||||
function ObsUrlToEnv(obs_ckpt_url, ckpt_url) | |||||
(3)Copy the output result to obs. | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
2、4 parameters need to be defined:
--data_url is the dataset you selected on the Qizhi platform
--ckpt_url is the weight file you selected on the Qizhi platform
--result_url is the output location
--data_url, --ckpt_url, --result_url and --device_target must be defined first in a single-dataset
inference task, otherwise an error will be reported.
There is no need to add these parameters to the run parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
3、How the dataset is used
The inference task uses data_url as the input and data_dir (i.e. '/cache/data') as the path of the
dataset inside the image.
For details, please refer to the sample code below.
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import Tensor | |||||
import numpy as np | |||||
from glob import glob | |||||
from dataset import create_dataset | |||||
from config import mnist_cfg as cfg | |||||
from lenet import LeNet5 | |||||
### Copy single dataset from obs to inference image ### | |||||
def ObsToEnv(obs_data_url, data_dir): | |||||
try: | |||||
mox.file.copy_parallel(obs_data_url, data_dir) | |||||
print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
return | |||||
### Copy the ckpt file from obs to the inference image ###
### Use mox.file.copy_parallel to copy folders; to copy a single file,
### use mox.file.copy, which is what is done here.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||||
try: | |||||
mox.file.copy(obs_ckpt_url, ckpt_url) | |||||
print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||||
return | |||||
### Copy the output result to obs### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||||
return | |||||
### --data_url, --ckpt_url, --result_url and --device_target must be defined first in an inference task,
### otherwise an error will be reported.
### There is no need to add these parameters to the run parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--data_url', | |||||
type=str, | |||||
default= '/cache/data/', | |||||
help='path where the dataset is saved') | |||||
parser.add_argument('--ckpt_url', | |||||
help='model to save/load', | |||||
default= '/cache/checkpoint.ckpt') | |||||
parser.add_argument('--result_url', | |||||
help='result folder to save/load', | |||||
default= '/cache/result/') | |||||
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||||
help='device where the code will be implemented (default: Ascend)') | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
###Initialize the data and result directories in the inference image### | |||||
data_dir = '/cache/data' | |||||
result_dir = '/cache/result' | |||||
ckpt_url = '/cache/checkpoint.ckpt' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(result_dir): | |||||
os.makedirs(result_dir) | |||||
###Copy dataset from obs to inference image | |||||
ObsToEnv(args.data_url, data_dir) | |||||
###Copy ckpt file from obs to inference image | |||||
ObsUrlToEnv(args.ckpt_url, ckpt_url) | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
repeat_size = cfg.epoch_size | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||||
print("============== Starting Testing ==============") | |||||
param_dict = load_checkpoint(os.path.join(ckpt_url)) | |||||
load_param_into_net(network, param_dict) | |||||
ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator() | |||||
data = next(ds_test) | |||||
images = data["image"].asnumpy() | |||||
labels = data["label"].asnumpy() | |||||
print('Tensor:', Tensor(data['image'])) | |||||
output = model.predict(Tensor(data['image'])) | |||||
    predicted = np.argmax(output.asnumpy(), axis=1)
    print('predicted:', predicted)
print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') | |||||
filename = 'result.txt' | |||||
file_path = os.path.join(result_dir, filename) | |||||
with open(file_path, 'a+') as file: | |||||
file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) | |||||
    ### Copy the result data from the local running environment back to obs,
    ### so that it can be downloaded from the corresponding inference task on the Qizhi platform
EnvToObs(result_dir, args.result_url) |
@@ -0,0 +1,158 @@ | |||||
""" | |||||
######################## multi-dataset inference lenet example ######################## | |||||
This example is a multi-dataset inference tutorial.
######################## Instructions for using the inference environment ######################## | |||||
1、Inference task requires predefined functions | |||||
(1)Copy multi dataset from obs to inference image. | |||||
function MultiObsToEnv(obs_data_url, data_dir) | |||||
(2)Copy ckpt file from obs to inference image. | |||||
function ObsUrlToEnv(obs_ckpt_url, ckpt_url) | |||||
(3)Copy the output result to obs. | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
2、5 parameters need to be defined:
--data_url is the first dataset you selected on the Qizhi platform
--multi_data_url is the list of datasets you selected on the Qizhi platform
--ckpt_url is the weight file you selected on the Qizhi platform
--result_url is the output location
--data_url, --multi_data_url, --ckpt_url, --result_url and --device_target must be defined first in a
multi-dataset inference task, otherwise an error will be reported.
There is no need to add these parameters to the run parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
3、How the dataset is used
A multi-dataset task uses multi_data_url as the input; data_dir + dataset name + file or folder name
within the dataset is the path of the dataset inside the inference image.
For example, the path of the test folder of the MNISTData dataset in this example is
data_dir + "/MNISTData" + "/test"
For details, please refer to the sample code below.
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import Tensor | |||||
import numpy as np | |||||
from glob import glob | |||||
from dataset import create_dataset | |||||
from config import mnist_cfg as cfg | |||||
from lenet import LeNet5 | |||||
import json | |||||
### Copy multiple datasets from obs to inference image ### | |||||
def MultiObsToEnv(multi_data_url, data_dir): | |||||
#--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
if not os.path.exists(path): | |||||
os.makedirs(path) | |||||
try: | |||||
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
return | |||||
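### A hypothetical example of the JSON carried by --multi_data_url (field names follow the
### parsing code above; the platform fills in the actual values):
###   [{"dataset_name": "MNISTData", "dataset_url": "obs://bucket/path/MNISTData/"},
###    {"dataset_name": "checkpoint_lenet-1_1875", "dataset_url": "obs://bucket/path/checkpoint_lenet-1_1875/"}]
### After MultiObsToEnv runs, each dataset is available under data_dir + "/" + dataset_name.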
### Copy the ckpt file from obs to the inference image ###
### Use mox.file.copy_parallel to copy folders; to copy a single file,
### use mox.file.copy, which is what is done here.
def ObsUrlToEnv(obs_ckpt_url, ckpt_url): | |||||
try: | |||||
mox.file.copy(obs_ckpt_url, ckpt_url) | |||||
print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) | |||||
return | |||||
### Copy the output result to obs### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||||
return | |||||
### --data_url, --multi_data_url, --ckpt_url, --result_url and --device_target must be defined first
### in a multi-dataset inference task, otherwise an error will be reported.
### There is no need to add these parameters to the run parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--data_url', | |||||
type=str, | |||||
default= '/cache/data1/', | |||||
help='path where the dataset is saved') | |||||
parser.add_argument('--multi_data_url', | |||||
type=str, | |||||
default= '/cache/data/', | |||||
help='path where the dataset is saved') | |||||
parser.add_argument('--ckpt_url', | |||||
help='model to save/load', | |||||
default= '/cache/checkpoint.ckpt') | |||||
parser.add_argument('--result_url', | |||||
help='result folder to save/load', | |||||
default= '/cache/result/') | |||||
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], | |||||
help='device where the code will be implemented (default: Ascend)') | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
###Initialize the data and result directories in the inference image### | |||||
data_dir = '/cache/data' | |||||
result_dir = '/cache/result' | |||||
ckpt_url = '/cache/checkpoint.ckpt' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(result_dir): | |||||
os.makedirs(result_dir) | |||||
###Copy multiple dataset from obs to inference image | |||||
MultiObsToEnv(args.multi_data_url, data_dir) | |||||
###Copy ckpt file from obs to inference image | |||||
ObsUrlToEnv(args.ckpt_url, ckpt_url) | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
repeat_size = cfg.epoch_size | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) | |||||
print("============== Starting Testing ==============") | |||||
param_dict = load_checkpoint(os.path.join(ckpt_url)) | |||||
load_param_into_net(network, param_dict) | |||||
ds_test = create_dataset(os.path.join(data_dir + "/MNISTData", "test"), batch_size=1).create_dict_iterator() | |||||
data = next(ds_test) | |||||
images = data["image"].asnumpy() | |||||
labels = data["label"].asnumpy() | |||||
print('Tensor:', Tensor(data['image'])) | |||||
output = model.predict(Tensor(data['image'])) | |||||
    predicted = np.argmax(output.asnumpy(), axis=1)
    print('predicted:', predicted)
print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"') | |||||
filename = 'result.txt' | |||||
file_path = os.path.join(result_dir, filename) | |||||
with open(file_path, 'a+') as file: | |||||
file.write(" {}: {:.2f} \n".format("Predicted", predicted[0])) | |||||
    ### Copy the result data from the local running environment back to obs,
    ### so that it can be downloaded from the corresponding inference task on the Qizhi platform
EnvToObs(result_dir, args.result_url) |
@@ -0,0 +1,60 @@ | |||||
# Copyright 2020 Huawei Technologies Co., Ltd | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
# ============================================================================ | |||||
"""LeNet.""" | |||||
import mindspore.nn as nn | |||||
from mindspore.common.initializer import Normal | |||||
class LeNet5(nn.Cell): | |||||
""" | |||||
Lenet network | |||||
Args: | |||||
num_class (int): Number of classes. Default: 10. | |||||
num_channel (int): Number of channels. Default: 1. | |||||
Returns: | |||||
Tensor, output tensor | |||||
Examples: | |||||
        >>> LeNet5(num_class=10)
""" | |||||
def __init__(self, num_class=10, num_channel=1, include_top=True): | |||||
super(LeNet5, self).__init__() | |||||
self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') | |||||
self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') | |||||
self.relu = nn.ReLU() | |||||
self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) | |||||
self.include_top = include_top | |||||
if self.include_top: | |||||
self.flatten = nn.Flatten() | |||||
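            # 16 * 5 * 5: a 1x32x32 input becomes 6x28x28 after conv1 (5x5, valid padding),
            # 6x14x14 after pooling, 16x10x10 after conv2, and 16x5x5 after the second pooling.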
self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) | |||||
self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) | |||||
self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) | |||||
def construct(self, x): | |||||
x = self.conv1(x) | |||||
x = self.relu(x) | |||||
x = self.max_pool2d(x) | |||||
x = self.conv2(x) | |||||
x = self.relu(x) | |||||
x = self.max_pool2d(x) | |||||
if not self.include_top: | |||||
return x | |||||
x = self.flatten(x) | |||||
x = self.relu(self.fc1(x)) | |||||
x = self.relu(self.fc2(x)) | |||||
x = self.fc3(x) | |||||
return x |
@@ -0,0 +1,201 @@ | |||||
""" | |||||
######################## single-dataset train lenet example ######################## | |||||
This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training | |||||
tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! | |||||
######################## Instructions for using the training environment ######################## | |||||
The image of the debugging environment and the image of the training environment are two different images, | |||||
and the working local directories are different. In the training task, you need to pay attention to the following points. | |||||
1、(1)The structure of the dataset uploaded for single dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
2、Single dataset training requires predefined functions | |||||
(1)Copy single dataset from obs to training image | |||||
function ObsToEnv(obs_data_url, data_dir) | |||||
(2)Copy the output to obs | |||||
function EnvToObs(train_dir, obs_train_url) | |||||
(3)Download the input from Qizhi And Init | |||||
function DownloadFromQizhi(obs_data_url, data_dir) | |||||
(4)Upload the output to Qizhi | |||||
function UploadToQizhi(train_dir, obs_train_url) | |||||
3、3 parameters need to be defined:
--data_url is the dataset you selected on the Qizhi platform
--data_url, --train_url and --device_target must be defined first in a single-dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the run parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
4、How the dataset is used
A single dataset uses data_url as the input and data_dir (i.e. '/cache/data') as the path of the
dataset inside the image.
For details, please refer to the sample code below.
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import mindspore.ops as ops | |||||
import time | |||||
### Copy single dataset from obs to training image### | |||||
def ObsToEnv(obs_data_url, data_dir): | |||||
try: | |||||
mox.file.copy_parallel(obs_data_url, data_dir) | |||||
print("Successfully Download {} to {}".format(obs_data_url, data_dir)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) | |||||
    # Create a marker file recording that the dataset has been copied from obs.
    # During multi-card training, other ranks wait for this file instead of copying the dataset again.
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"):
        print("download_input succeed")
return | |||||
### Copy the output to obs### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(obs_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
        # The obs copy only needs to run once per server; let the first card of each
        # 8-card server (RANK_ID % 8 == 0) copy the data.
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
ObsToEnv(obs_data_url,data_dir) | |||||
        # If the marker file does not exist, the download has not finished yet;
        # wait for the copying card to complete.
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
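### Note: RANK_SIZE, RANK_ID and ASCEND_DEVICE_ID read above are environment variables that the
### NPU job scheduler is expected to set (RANK_SIZE: total number of cards, RANK_ID: global rank
### of this process, ASCEND_DEVICE_ID: logical device id); this is how the sample scripts use them.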
### --data_url, --train_url and --device_target must be defined first in a single-dataset task,
### otherwise an error will be reported.
### There is no need to add these parameters to the run parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument('--data_url', | |||||
help='path to training/inference dataset folder', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--train_url', | |||||
help='output folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
data_dir = '/cache/data' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.data_url, data_dir) | |||||
    ### The dataset path used here is data_dir + "/train"
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
    ### Copy the trained output data from the local running environment back to obs,
    ### so that it can be downloaded from the corresponding training task on the Qizhi platform
UploadToQizhi(train_dir,args.train_url) |
@@ -0,0 +1,99 @@ | |||||
""" | |||||
######################## train lenet dataparallel example ######################## | |||||
train lenet and get network model files(.ckpt) | |||||
Training on the intelligent computing network (C2Net) currently supports single-dataset training and
does not require the obs copy step. Only two paths need to be defined, then training can be run directly:
train_dir = '/cache/output' #The location of the output | |||||
data_dir = '/cache/dataset' #The location of the dataset | |||||
""" | |||||
import os | |||||
import argparse | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from config import mnist_cfg as cfg | |||||
from lenet import LeNet5 | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.common import set_seed | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank, get_group_size | |||||
import mindspore.ops as ops | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
###define two parameters and then call it directly### | |||||
data_dir = '/cache/dataset' | |||||
train_dir = '/cache/output' | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, | |||||
net_loss, | |||||
net_opt, | |||||
metrics={"accuracy": Accuracy()}, | |||||
amp_level="O2") | |||||
config_ck = CheckpointConfig( | |||||
save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In the example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size,ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], dataset_sink_mode=False) | |||||
@@ -0,0 +1,220 @@ | |||||
""" | |||||
######################## multi-dataset train lenet example ######################## | |||||
This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset | |||||
training tutorial train.py. This example cannot be used for a single dataset! | |||||
""" | |||||
""" | |||||
######################## Instructions for using the training environment ######################## | |||||
1、(1)The structure of the dataset uploaded for multi-dataset training in this example | |||||
MNISTData.zip | |||||
├── test | |||||
└── train | |||||
checkpoint_lenet-1_1875.zip | |||||
├── checkpoint_lenet-1_1875.ckpt | |||||
(2)The dataset structure in the training image for multiple datasets in this example | |||||
workroot | |||||
├── MNISTData | |||||
| ├── test | |||||
| └── train | |||||
└── checkpoint_lenet-1_1875 | |||||
├── checkpoint_lenet-1_1875.ckpt | |||||
2、Multi-dataset training requires predefined functions
(1)Copy the multiple datasets from obs to the training image
function MultiObsToEnv(multi_data_url, data_dir)
(2)Copy the output to obs
function EnvToObs(train_dir, obs_train_url)
(3)Download the input from Qizhi and init
function DownloadFromQizhi(multi_data_url, data_dir)
(4)Upload the output to Qizhi
function UploadToQizhi(train_dir, obs_train_url)
3、4 parameters need to be defined:
--data_url is the first dataset you selected on the Qizhi platform
--multi_data_url is the list of datasets you selected on the Qizhi platform
--data_url, --multi_data_url, --train_url and --device_target must be defined first in a multi-dataset task,
otherwise an error will be reported.
There is no need to add these parameters to the run parameters of the Qizhi platform;
they are predefined in the background, you only need to define them in your code.
4、How the dataset is used
A multi-dataset task uses multi_data_url as the input; data_dir + dataset name + file or folder name
within the dataset is the path of the dataset inside the training image.
For example, the path of the train folder of the MNISTData dataset in this example is
data_dir + "/MNISTData" + "/train"
For details, please refer to the sample code below.
""" | |||||
import os | |||||
import argparse | |||||
import moxing as mox | |||||
from config import mnist_cfg as cfg | |||||
from dataset import create_dataset | |||||
from dataset_distributed import create_dataset_parallel | |||||
from lenet import LeNet5 | |||||
import json | |||||
import mindspore.nn as nn | |||||
from mindspore import context | |||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||||
from mindspore.train import Model | |||||
from mindspore.nn.metrics import Accuracy | |||||
from mindspore import load_checkpoint, load_param_into_net | |||||
from mindspore.context import ParallelMode | |||||
from mindspore.communication.management import init, get_rank | |||||
import time | |||||
### Copy multiple datasets from obs to training image ### | |||||
def MultiObsToEnv(multi_data_url, data_dir): | |||||
#--multi_data_url is json data, need to do json parsing for multi_data_url | |||||
multi_data_json = json.loads(multi_data_url) | |||||
for i in range(len(multi_data_json)): | |||||
path = data_dir + "/" + multi_data_json[i]["dataset_name"] | |||||
if not os.path.exists(path): | |||||
os.makedirs(path) | |||||
try: | |||||
mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) | |||||
print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) | |||||
except Exception as e: | |||||
print('moxing download {} to {} failed: '.format( | |||||
multi_data_json[i]["dataset_url"], path) + str(e)) | |||||
    # Create a marker file recording that the datasets have been copied from obs.
    # During multi-card training, other ranks wait for this file instead of copying the datasets again.
f = open("/cache/download_input.txt", 'w') | |||||
f.close() | |||||
    if os.path.exists("/cache/download_input.txt"):
        print("download_input succeed")
return | |||||
### Copy the output model to obs ### | |||||
def EnvToObs(train_dir, obs_train_url): | |||||
try: | |||||
mox.file.copy_parallel(train_dir, obs_train_url) | |||||
print("Successfully Upload {} to {}".format(train_dir, | |||||
obs_train_url)) | |||||
except Exception as e: | |||||
print('moxing upload {} to {} failed: '.format(train_dir, | |||||
obs_train_url) + str(e)) | |||||
return | |||||
def DownloadFromQizhi(multi_data_url, data_dir): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) | |||||
if device_num > 1: | |||||
# set device_id and init for multi-card training | |||||
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) | |||||
context.reset_auto_parallel_context() | |||||
context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) | |||||
init() | |||||
        # The obs copy only needs to run once per server; let the first card of each
        # 8-card server (RANK_ID % 8 == 0) copy the data.
local_rank=int(os.getenv('RANK_ID')) | |||||
if local_rank%8==0: | |||||
MultiObsToEnv(multi_data_url,data_dir) | |||||
        # If the marker file does not exist, the download has not finished yet;
        # wait for the copying card to complete.
while not os.path.exists("/cache/download_input.txt"): | |||||
time.sleep(1) | |||||
return | |||||
def UploadToQizhi(train_dir, obs_train_url): | |||||
device_num = int(os.getenv('RANK_SIZE')) | |||||
local_rank=int(os.getenv('RANK_ID')) | |||||
if device_num == 1: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
if device_num > 1: | |||||
if local_rank%8==0: | |||||
EnvToObs(train_dir, obs_train_url) | |||||
return | |||||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example') | |||||
### --data_url, --multi_data_url, --train_url and --device_target must be defined first in a
### multi-dataset task, otherwise an error will be reported.
### There is no need to add these parameters to the run parameters of the Qizhi platform;
### they are predefined in the background, you only need to define them in your code.
parser.add_argument('--data_url', | |||||
help='path to training/inference dataset folder', | |||||
default= '/cache/data1/') | |||||
parser.add_argument('--multi_data_url', | |||||
help='path to multi dataset', | |||||
default= '/cache/data/') | |||||
parser.add_argument('--train_url', | |||||
help='model folder to save/load', | |||||
default= '/cache/output/') | |||||
parser.add_argument( | |||||
'--device_target', | |||||
type=str, | |||||
default="Ascend", | |||||
choices=['Ascend', 'CPU'], | |||||
help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') | |||||
parser.add_argument('--epoch_size', | |||||
type=int, | |||||
default=5, | |||||
help='Training epochs.') | |||||
if __name__ == "__main__": | |||||
args = parser.parse_args() | |||||
data_dir = '/cache/data' | |||||
train_dir = '/cache/output' | |||||
if not os.path.exists(data_dir): | |||||
os.makedirs(data_dir) | |||||
if not os.path.exists(train_dir): | |||||
os.makedirs(train_dir) | |||||
###Initialize and copy data to training image | |||||
DownloadFromQizhi(args.multi_data_url, data_dir) | |||||
    ### The dataset path used here is data_dir + "/MNISTData" + "/train"
device_num = int(os.getenv('RANK_SIZE')) | |||||
if device_num == 1: | |||||
ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if device_num > 1: | |||||
ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) | |||||
if ds_train.get_dataset_size() == 0: | |||||
raise ValueError( | |||||
"Please check dataset size > 0 and batch_size <= dataset size") | |||||
network = LeNet5(cfg.num_classes) | |||||
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | |||||
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) | |||||
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) | |||||
    ### The checkpoint path used here is data_dir + "/checkpoint_lenet-1_1875" + "/checkpoint_lenet-1_1875.ckpt"
load_param_into_net(network, load_checkpoint(os.path.join(data_dir + "/checkpoint_lenet-1_1875", | |||||
"checkpoint_lenet-1_1875.ckpt"))) | |||||
if args.device_target != "Ascend": | |||||
model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) | |||||
else: | |||||
model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") | |||||
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, | |||||
keep_checkpoint_max=cfg.keep_checkpoint_max) | |||||
#Note that this method saves the model file on each card. You need to specify the save path on each card. | |||||
# In this example, get_rank() is added to distinguish different paths. | |||||
if device_num == 1: | |||||
outputDirectory = train_dir + "/" | |||||
if device_num > 1: | |||||
outputDirectory = train_dir + "/" + str(get_rank()) + "/" | |||||
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", | |||||
directory=outputDirectory, | |||||
config=config_ck) | |||||
print("============== Starting Training ==============") | |||||
epoch_size = cfg['epoch_size'] | |||||
if (args.epoch_size): | |||||
epoch_size = args.epoch_size | |||||
print('epoch_size is: ', epoch_size) | |||||
model.train(epoch_size, | |||||
ds_train, | |||||
callbacks=[time_cb, ckpoint_cb, | |||||
LossMonitor()]) | |||||
    ### Copy the trained output data from the local running environment back to obs,
    ### so that it can be downloaded from the corresponding training task on the Qizhi platform
UploadToQizhi(train_dir,args.train_url) | |||||
@@ -0,0 +1 @@ | |||||
print('this is test.py, test 123') |
@@ -0,0 +1,10 @@ | |||||
"" | |||||
“中文双引号” | |||||
"vim" | |||||
“” | |||||
hello world “中文双引号” | |||||
hi "en" | |||||
'mark' | |||||
'' | |||||
"abc hello" |
@@ -0,0 +1,91 @@ | |||||
#!/usr/bin/python | |||||
#coding=utf-8 | |||||
''' | |||||
If there are Chinese comments in the code, please add at the beginning:
#!/usr/bin/python
#coding=utf-8
Because of A100 compatibility requirements, before using the training environment please use the
platform's recommended image with CUDA 11, then adjust the code and submit the image.
The image used by this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
In the training environment, the uploaded datasets are automatically placed under the /dataset directory.
For a single dataset:
    if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test;
For multiple datasets:
    if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
    the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
    and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
The model output path defaults to /model. Write the model output to /model;
the Qizhi platform provides file downloads from the /model directory.
''' | |||||
from model import Model | |||||
import numpy as np | |||||
import torch | |||||
from torchvision.datasets import mnist | |||||
from torch.nn import CrossEntropyLoss | |||||
from torch.optim import SGD | |||||
from torch.utils.data import DataLoader | |||||
from torchvision.transforms import ToTensor | |||||
import argparse | |||||
import datetime | |||||
# Training settings | |||||
parser = argparse.ArgumentParser(description='PyTorch MNIST Example') | |||||
#The dataset location is placed under /dataset | |||||
parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') | |||||
parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') | |||||
parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') | |||||
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') | |||||
def gettime(): | |||||
timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') | |||||
return timestr | |||||
if __name__ == '__main__': | |||||
args, unknown = parser.parse_known_args() | |||||
#log output | |||||
print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available())) | |||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") | |||||
batch_size = args.batch_size | |||||
train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) | |||||
test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) | |||||
train_loader = DataLoader(train_dataset, batch_size=batch_size) | |||||
test_loader = DataLoader(test_dataset, batch_size=batch_size) | |||||
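    # Note: shuffle is not enabled here; for actual training you would typically pass
    # shuffle=True to the training DataLoader.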
model = Model().to(device) | |||||
sgd = SGD(model.parameters(), lr=1e-1) | |||||
cost = CrossEntropyLoss() | |||||
epoch = args.epoch_size | |||||
print(gettime(), 'epoch_size is:{}'.format(epoch)) | |||||
for _epoch in range(epoch): | |||||
print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1)) | |||||
model.train() | |||||
for idx, (train_x, train_label) in enumerate(train_loader): | |||||
train_x = train_x.to(device) | |||||
train_label = train_label.to(device) | |||||
sgd.zero_grad() | |||||
predict_y = model(train_x.float()) | |||||
loss = cost(predict_y, train_label.long()) | |||||
if idx % 10 == 0: | |||||
print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item())) | |||||
loss.backward() | |||||
sgd.step() | |||||
correct = 0 | |||||
_sum = 0 | |||||
model.eval() | |||||
        for idx, (test_x, test_label) in enumerate(test_loader):
            predict_y = model(test_x.to(device).float()).detach()
            # index of the highest-scoring class for each test sample
            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
            _ = predict_ys == test_label
            correct += np.sum(_.numpy(), axis=-1)
            _sum += _.shape[0]
print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum)) | |||||
#The model output location is placed under /model | |||||
torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) |
@@ -0,0 +1,233 @@ | |||||
/home/work | |||||
start loading script | |||||
finish loading script | |||||
2022/09/26 16:24:20 Start to download master.zip | |||||
2022/09/26 16:24:20 Total parts count 1 | |||||
2022/09/26 16:24:21 part(1) finished | |||||
2022/09/26 16:24:21 Download object finished, downloadPath:/cache/code/master.zip | |||||
panic: runtime error: index out of range [4] with length 4 | |||||
goroutine 1 [running]: | |||||
main.main() | |||||
/home/houysh/openi/lewis/sync_for_grampus/downloader_for_obs.go:41 +0x4e0 | |||||
unzip finished;start to exec code; | |||||
do nothing | |||||
[Modelarts Service Log]user: uid=1101(work) gid=1101(work) groups=1101(work),1000(HwHiAiUser) | |||||
[Modelarts Service Log]pwd: /home/work | |||||
[Modelarts Service Log]boot_file: /cache/code/npu_test/npu/train_for_c2net.py | |||||
[Modelarts Service Log]log_url: /tmp/log/train.log | |||||
[Modelarts Service Log]command: /cache/code/npu_test/npu/train_for_c2net.py | |||||
[Modelarts Service Log]local_code_dir: | |||||
[Modelarts Service Log]Training start at 2022-09-26-16:24:21 | |||||
[Modelarts Service Log][modelarts_create_log] modelarts-pipe found | |||||
[ModelArts Service Log]modelarts-pipe: will create log file /tmp/log/train.log | |||||
[Modelarts Service Log][modelarts_logger] modelarts-pipe found | |||||
[ModelArts Service Log]modelarts-pipe: will create log file /tmp/log/train.log | |||||
[ModelArts Service Log]modelarts-pipe: will write log file /tmp/log/train.log | |||||
[ModelArts Service Log]modelarts-pipe: param for max log length: 1073741824 | |||||
[ModelArts Service Log]modelarts-pipe: param for whether exit on overflow: 0 | |||||
INFO:root:Using MoXing-v2.0.0.rc2.4b57a67b-4b57a67b | |||||
INFO:root:Using OBS-Python-SDK-3.20.9.1 | |||||
[Modelarts Service Log]2022-09-26 16:24:22,746 - INFO - Ascend Driver: Version=22.0.0.3 | |||||
[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - you are advised to use ASCEND_DEVICE_ID env instead of DEVICE_ID, as the DEVICE_ID env will be discarded in later versions | |||||
[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - particularly, ${ASCEND_DEVICE_ID} == ${DEVICE_ID}, it's the logical device id | |||||
[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - Davinci training command | |||||
[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - ['/usr/bin/python', '/cache/code/npu_test/npu/train_for_c2net.py'] | |||||
[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - Wait for Rank table file ready | |||||
[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - Rank table file (K8S generated) is ready for read | |||||
[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - | |||||
{ | |||||
"status": "completed", | |||||
"group_count": "1", | |||||
"group_list": [ | |||||
{ | |||||
"group_name": "job-wjtes2022092616t2327", | |||||
"device_count": "1", | |||||
"instance_count": "1", | |||||
"instance_list": [ | |||||
{ | |||||
"pod_name": "joba57ac677-job-wjtes2022092616t2327-0", | |||||
"server_id": "192.168.0.189", | |||||
"devices": [ | |||||
{ | |||||
"device_id": "3", | |||||
"device_ip": "192.4.68.236" | |||||
} | |||||
] | |||||
} | |||||
] | |||||
} | |||||
] | |||||
} | |||||
[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - Rank table file (C7x) | |||||
[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - | |||||
{ | |||||
"status": "completed", | |||||
"version": "1.0", | |||||
"server_count": "1", | |||||
"server_list": [ | |||||
{ | |||||
"server_id": "192.168.0.189", | |||||
"device": [ | |||||
{ | |||||
"device_id": "3", | |||||
"device_ip": "192.4.68.236", | |||||
"rank_id": "0" | |||||
} | |||||
] | |||||
} | |||||
] | |||||
} | |||||
[Modelarts Service Log]2022-09-26 16:24:22,749 - INFO - Rank table file (C7x) is generated | |||||
[Modelarts Service Log]2022-09-26 16:24:22,749 - INFO - Current server | |||||
[Modelarts Service Log]2022-09-26 16:24:22,749 - INFO - | |||||
{ | |||||
"server_id": "192.168.0.189", | |||||
"device": [ | |||||
{ | |||||
"device_id": "3", | |||||
"device_ip": "192.4.68.236", | |||||
"rank_id": "0" | |||||
} | |||||
] | |||||
} | |||||
[Modelarts Service Log]2022-09-26 16:24:22,750 - INFO - bootstrap proc-rank-0-device-0 | |||||
args: | |||||
Namespace(device_target='Ascend', epoch_size=5) | |||||
Traceback (most recent call last): | |||||
File "/cache/code/npu_test/npu/train_for_c2net.py", line 50, in <module> | |||||
cfg.batch_size) | |||||
File "/cache/code/npu_test/npu/dataset.py", line 32, in create_dataset | |||||
mnist_ds = ds.MnistDataset(data_path) | |||||
File "/usr/local/ma/python3.7/lib/python3.7/site-packages/mindspore/dataset/engine/validators.py", line 343, in new_method | |||||
check_dir(dataset_dir) | |||||
File "/usr/local/ma/python3.7/lib/python3.7/site-packages/mindspore/dataset/core/validator_helpers.py", line 551, in check_dir | |||||
raise ValueError("The folder {} does not exist or is not a directory or permission denied!".format(dataset_dir)) | |||||
ValueError: The folder /cache/dataset/train does not exist or is not a directory or permission denied! | |||||
[Modelarts Service Log]2022-09-26 16:24:31,765 - ERROR - proc-rank-0-device-0 (pid: 159) has exited with non-zero code: 1 | |||||
[Modelarts Service Log]2022-09-26 16:24:31,765 - INFO - Begin destroy training processes | |||||
[Modelarts Service Log]2022-09-26 16:24:31,765 - INFO - proc-rank-0-device-0 (pid: 159) has exited | |||||
[Modelarts Service Log]2022-09-26 16:24:31,765 - INFO - End destroy training processes | |||||
[ModelArts Service Log]modelarts-pipe: total length: 3763 | |||||
[Modelarts Service Log]Training end with return code: 1 | |||||
[Modelarts Service Log]Training end at 2022-09-26-16:24:31 | |||||
[Modelarts Service Log]Training completed. | |||||
2022/09/26 16:24:51 start uploading model | |||||
2022/09/26 16:24:51 file:train.log | |||||
2022/09/26 16:24:52 finish uploading model |