diff --git a/001.ipynb b/001.ipynb new file mode 100755 index 0000000..95d09f2 --- /dev/null +++ b/001.ipynb @@ -0,0 +1 @@ +hello world \ No newline at end of file diff --git a/1.ipynb b/1.ipynb new file mode 100755 index 0000000..bd120ec --- /dev/null +++ b/1.ipynb @@ -0,0 +1,23 @@ +# Wild type sequence provided in the "Dataset Description": +wtseq <- 'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK' + +# Read testing set sequences and pH: +test <- read.csv('../input/novozymes-enzyme-stability-prediction/test.csv') + +# Add mutation information to testing set: +test[,c('type','resid','wt','mut')] <- do.call(rbind,lapply(test$protein_sequence,function(seq){ + # case 1 = wild type: + if(seq==wtseq){ + return(c('WT',-1,'_','_')) + # case 2 = substitution: + } else if(nchar(seq)==nchar(wtseq)){ + i <- mapply(function(x,y) which(x!=y)[1], strsplit(seq,""), strsplit(wtseq,"")) + return(c('SUB',i,substr(wtseq,i,i),substr(seq,i,i))) + # case 3 = deletion: + } else if(nchar(seq)\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'torch'" + ] + } + ], + "source": [ + "# -*- coding: utf-8 -*-\n", + "from __future__ import print_function, division\n", + "\n", + "# import sys\n", + "# sys.path.append('/home/xujiahong/openI_benchmark/vechicle_reID_VechicleNet/')\n", + "\n", + "import time\n", + "import yaml\n", + "import pickle\n", + "import torch\n", + "import torch.nn as nn\n", + "import numpy as np\n", + "from torchvision import datasets,transforms\n", + "import os\n", + "import scipy.io\n", + "from tqdm import tqdm\n", + "from data_utils.model_train import ft_net\n", + "from utils.util import get_stream_logger\n", + "from config.mainconfig import OUTPUT_RESULT_DIR, CONFIG_PATH\n", + "\n", + "\n", + "\n", + "def fliplr(img):\n", + " '''flip horizontal'''\n", + " inv_idx = torch.arange(img.size(3)-1,-1,-1).long() # N x C x H x W\n", + " img_flip = img.index_select(3,inv_idx)\n", + " return img_flip\n", + "\n", + "def extract_feature(model, dataloaders, flip):\n", + " features = torch.FloatTensor()\n", + " count = 0\n", + " for _, data in enumerate(tqdm(dataloaders),0):\n", + " img, _ = data\n", + " n, c, h, w = img.size()\n", + " count += n\n", + "\n", + " input_img = img.cuda()\n", + " ff = model(input_img)\n", + "\n", + " if flip:\n", + " img = fliplr(img)\n", + " input_img = img.cuda()\n", + " outputs_flip = model(input_img)\n", + " ff += outputs_flip\n", + "\n", + " fnorm = torch.norm(ff, p=2, dim=1, keepdim=True)\n", + " ff = ff.div(fnorm.expand_as(ff))\n", + " #print(ff.shape)\n", + " 
features = torch.cat((features,ff.data.cpu().float()), 0)\n", + " #features = torch.cat((features,ff.data.float()), 0)\n", + " return features\n", + "\n", + "\n", + "def get_id(img_path):\n", + " '''\n", + " xjh: \n", + " example of the name of the img: 0769_c013_00074310_0\n", + " 0769 is the vehicleID, 013 is the cameraID, 00074310 is the frameID\n", + " '''\n", + " camera_id = []\n", + " labels = []\n", + " for path, _ in img_path:\n", + " #filename = path.split('/')[-1]\n", + " filename = os.path.basename(path) #get the name of images\n", + " # Test Gallery Image\n", + " if not 'c' in filename: \n", + " labels.append(9999999)\n", + " camera_id.append(9999999)\n", + " else:\n", + " #label = filename[0:4]\n", + " label = filename[0:5] #for benchmark_person\n", + " camera = filename.split('c')[1]\n", + " if label[0:2]=='-1':\n", + " labels.append(-1)\n", + " else:\n", + " labels.append(int(label))\n", + " #camera_id.append(int(camera[0:3]))\n", + " camera_id.append(int(camera[0:2]))#for benchmark_person\n", + " #print(camera[0:3])\n", + " return camera_id, labels\n", + "\n", + "\n", + "def test(config_file_path:str, logger):\n", + " #read config files\n", + " with open(config_file_path, encoding='utf-8') as f:\n", + " opts = yaml.load(f, Loader=yaml.SafeLoader)\n", + "\n", + " data_dir = opts['input']['dataset']['data_dir']\n", + " name = \"trained_\" + opts['input']['config']['name']\n", + " trained_model_name = name + \"_last.pth\"\n", + " save_path = OUTPUT_RESULT_DIR\n", + "\n", + " nclass = opts['input']['config']['nclass']\n", + " stride = opts['input']['config']['stride']\n", + " pool = opts['input']['config']['pool']\n", + " droprate = opts['input']['config']['droprate']\n", + " inputsize= opts['input']['config']['inputsize']\n", + " w = opts['input']['config']['w']\n", + " h = opts['input']['config']['h']\n", + " batchsize = opts['input']['config']['batchsize']\n", + " flip = opts['test']['flip_test']\n", + "\n", + " trained_model_path = os.path.join(save_path, trained_model_name)\n", + "\n", + " ##############################load model#################################################\n", + " ###self-train\n", + " model = ft_net(class_num = nclass, droprate = droprate, stride=stride, init_model=None, pool = pool, return_f=False)\n", + " \n", + " try:\n", + " model.load_state_dict(torch.load(trained_model_path))\n", + " except:\n", + " model = torch.nn.DataParallel(model)\n", + " model.load_state_dict(torch.load(trained_model_path))\n", + " model = model.module\n", + " model.classifier.classifier = nn.Sequential() #model ends with feature extractor(output len is 512)\n", + " # print(model)\n", + " \n", + " ##############################load dataset###############################################\n", + " \n", + " #transforms for input image h==w==299, inputsize==256\n", + " if h == w:\n", + " data_transforms = transforms.Compose([\n", + " transforms.Resize( ( round(inputsize*1.1), round(inputsize*1.1)), interpolation=3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + " else:\n", + " data_transforms = transforms.Compose( [\n", + " transforms.Resize((round(h*1.1), round(w*1.1)), interpolation=3), #Image.BICUBIC\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + "\n", + " image_datasets = {x: datasets.ImageFolder( os.path.join(data_dir,x) ,data_transforms) for x in ['bounding_box_test','query']}\n", + " dataloaders = {x: 
torch.utils.data.DataLoader(image_datasets[x], batch_size=batchsize,\n", + " shuffle=False, num_workers=8) for x in ['bounding_box_test','query']}\n", + "\n", + " #############################check GPU###################################################\n", + " use_gpu = torch.cuda.is_available()\n", + "\n", + "\n", + " #############################extract features############################################\n", + " # Change to test mode\n", + " model = model.eval()\n", + " if use_gpu:\n", + " model = model.cuda()\n", + "\n", + " gallery_path = image_datasets['bounding_box_test'].imgs\n", + " query_path = image_datasets['query'].imgs\n", + "\n", + " gallery_cam,gallery_label = get_id(gallery_path)\n", + " query_cam,query_label = get_id(query_path)\n", + "\n", + "\n", + " gallery_label = np.asarray(gallery_label)\n", + " query_label = np.asarray(query_label)\n", + " gallery_cam = np.asarray(gallery_cam)\n", + " query_cam = np.asarray(query_cam)\n", + " print('Gallery Size: %d'%len(gallery_label))\n", + " print('Query Size: %d'%len(query_label))\n", + " # Extract feature\n", + " since = time.time()\n", + " with torch.no_grad():\n", + " gallery_feature = extract_feature(model, dataloaders['bounding_box_test'], flip)\n", + " query_feature = extract_feature(model, dataloaders['query'], flip)\n", + " process_time = time.time() - since\n", + " logger.info('total forward time: %.2f minutes'%(process_time/60))\n", + " \n", + " dist = 1-torch.mm(query_feature, torch.transpose(gallery_feature, 0, 1))\n", + "\n", + " # Save to Matlab for check\n", + " extracted_feature = {'gallery_feature': gallery_feature.numpy(), 'gallery_label':gallery_label, 'gallery_cam':gallery_cam, \\\n", + " 'query_feature': query_feature.numpy(), 'query_label':query_label, 'query_cam':query_cam}\n", + "\n", + " result_name = os.path.join(save_path, name+'_feature.mat')\n", + " scipy.io.savemat(result_name, extracted_feature) \n", + "\n", + " return_dict = {}\n", + "\n", + " return_dict['dist'] = dist.numpy()\n", + " return_dict['feature_example'] = query_feature[0].numpy()\n", + " return_dict['gallery_label'] = gallery_label\n", + " return_dict['gallery_cam'] = gallery_cam\n", + " return_dict['query_label'] = query_label\n", + " return_dict['query_cam'] = query_cam\n", + "\n", + " pickle.dump(return_dict, open(OUTPUT_RESULT_DIR+'test_result.pkl', 'wb'), protocol=4)\n", + "\n", + " return \n", + "\n", + " # eval_result = evaluator(result, logger)\n", + " # full_table = display_eval_result(dict = eval_result)\n", + " # logger.info(full_table)\n", + "\n", + "if __name__==\"__main__\":\n", + " logger = get_stream_logger('TEST')\n", + " test(CONFIG_PATH, logger)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27b171e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/c2net_listdata.py b/c2net_listdata.py new file mode 100644 index 0000000..df94803 --- /dev/null +++ b/c2net_listdata.py @@ -0,0 +1,114 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + + +import 
os
+import argparse
+
+import moxing as mox
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from dataset_distributed import create_dataset_parallel
+from lenet import LeNet5
+import json
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore import load_checkpoint, load_param_into_net
+from mindspore.context import ParallelMode
+from mindspore.communication.management import init, get_rank
+import time
+
+### Copy multiple datasets from OBS into the training image ###
+def MultiObsToEnv(multi_data_url, data_dir):
+    #--multi_data_url is a JSON string and must be parsed first
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        path = data_dir + "/" + multi_data_json[i]["dataset_name"]
+        file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0]
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        try:
+            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
+            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path))
+            #unzip the dataset
+            os.system("unzip -d %s %s" % (file_path, path))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(
+                multi_data_json[i]["dataset_url"], path) + str(e))
+    #Write a cache file marking that the data has been copied from OBS.
+    #If this file exists during multi-card training, the dataset does not need to be copied again.
+    f = open("/cache/download_input.txt", 'w')
+    f.close()
+    try:
+        if os.path.exists("/cache/download_input.txt"):
+            print("download_input succeed")
+    except Exception as e:
+        print("download_input failed")
+    return
+
+def DownloadFromQizhi(multi_data_url, data_dir):
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        MultiObsToEnv(multi_data_url,data_dir)
+        context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
+    if device_num > 1:
+        # set device_id and init for multi-card training
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
+        init()
+        #The OBS data only needs to be copied once, so let card 0 do the copy
+        local_rank=int(os.getenv('RANK_ID'))
+        if local_rank%8==0:
+            MultiObsToEnv(multi_data_url,data_dir)
+        #If the cache file does not exist, the copy has not finished yet:
+        #wait for card 0 to finish copying the data
+        while not os.path.exists("/cache/download_input.txt"):
+            time.sleep(1)
+    return
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+### --multi_data_url, --ckpt_url and --device_target must be defined first in a multi-dataset job,
+### otherwise an error will be reported.
+### There is no need to add these parameters to the run parameters of the Qizhi platform:
+### they are predefined in the background, you only need to define them in your code.
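+### A sketch of the JSON expected in --multi_data_url, inferred from the field accesses in
+### MultiObsToEnv() above (the URL value is an illustrative placeholder, not a real OBS path):
+###   [{"dataset_name": "MnistDataset_torch.zip", "dataset_url": "obs://bucket/path/MnistDataset_torch.zip"}]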
+ +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + print("--------start ls:") + os.system("cd /cache/dataset; ls -al") + print("--------end ls-----------") + diff --git a/dummyFolder/1.py b/dummyFolder/1.py new file mode 100644 index 0000000..75d9766 --- /dev/null +++ b/dummyFolder/1.py @@ -0,0 +1 @@ +print('hello world') diff --git a/gpu/pretrain.py b/gpu/pretrain.py new file mode 100755 index 0000000..191da3c --- /dev/null +++ b/gpu/pretrain.py @@ -0,0 +1,128 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +1,The dataset structure of the single-dataset in this example + MnistDataset_torch.zip + ├── test + └── train + +2,Due to the adaptability of a100, before using the training environment, please use the recommended image of the +platform with cuda 11.Then adjust the code and submit the image. +The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 +In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. +Note: the paths are different when selecting a single dataset and multiple datasets. +(1)If it is a single dataset: if MnistDataset_torch.zip is selected, + the dataset directory is /dataset/train, /dataset/test; + If it is a multiple dataset: if MnistDataset_torch.zip is selected, + the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test; + +(2)If the pre-training model file is selected, the selected pre-training model will be +automatically placed in the /pretrainmodel directory. +for example: + If the model file is selected, the calling method is: '/pretrainmodel/' + args.pretrainmodelname + +The model download path is under /model by default. Please specify the model output location to /model, +and the Qizhi platform will provide file downloads under the /model directory. 
+''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse +import os + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#The dataset location is placed under /dataset +parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') +#获取模型文件名称 +parser.add_argument('--ckpt_url', default="", help='pretrain model path') + +# 参数声明 +WORKERS = 0 # dataloder线程数 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = Model().to(device) +optimizer = SGD(model.parameters(), lr=1e-1) +cost = CrossEntropyLoss() + +# 模型训练 +def train(model, train_loader, epoch): + model.train() + train_loss = 0 + for i, data in enumerate(train_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + optimizer.zero_grad() + y_hat = model(x) + loss = cost(y_hat, y) + loss.backward() + optimizer.step() + train_loss += loss + loss_mean = train_loss / (i+1) + print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean.item())) + +# 模型测试 +def test(model, test_loader, test_data): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for i, data in enumerate(test_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + optimizer.zero_grad() + y_hat = model(x) + test_loss += cost(y_hat, y).item() + pred = y_hat.max(1, keepdim=True)[1] + correct += pred.eq(y.view_as(pred)).sum().item() + test_loss /= (i+1) + print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_data), 100. 
* correct / len(test_data))) +def main(): + # 如果有保存的模型,则加载模型,并在其基础上继续训练 + if os.path.exists(args.ckpt_url): + checkpoint = torch.load(args.ckpt_url) + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + print('加载 epoch {} 权重成功!'.format(start_epoch)) + else: + start_epoch = 0 + print('无保存模型,将从头开始训练!') + + for epoch in range(start_epoch+1, epochs): + train(model, train_loader, epoch) + test(model, test_loader, test_dataset) + # 保存模型 + state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} + torch.save(state, '/model/mnist_epoch{}.pkl'.format(epoch)) + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #log output + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + epochs = args.epoch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + main() + + diff --git a/gpu/pretrain_for_c2net.py b/gpu/pretrain_for_c2net.py new file mode 100755 index 0000000..fba79d3 --- /dev/null +++ b/gpu/pretrain_for_c2net.py @@ -0,0 +1,144 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +In the training environment, +(1)the code will be automatically placed in the /tmp/code directory, +(2)the uploaded dataset will be automatically placed in the /tmp/dataset directory +Note: the paths are different when selecting a single dataset and multiple datasets. +(1)If it is a single dataset: if MnistDataset_torch.zip is selected, + the dataset directory is /tmp/dataset/train, /dataset/test; + +The dataset structure of the single dataset in the training image in this example: + tmp + ├──dataset + ├── test + └── train + +If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip, +the dataset directory is /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test +and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl +The dataset structure in the training image for multiple datasets in this example: +tmp + ├──dataset + ├── MnistDataset_torch + | ├── test + | └── train + └── checkpoint_epoch1_0.73 + ├── mnist_epoch1_0.73.pkl +(3)the model download path is under /tmp/output by default, please specify the model output location to /tmp/output, +qizhi platform will provide file downloads under the /tmp/output directory. +(4)If the pre-training model file is selected, the selected pre-training model will be +automatically placed in the /tmp/pretrainmodel directory. 
+for example: + If the model file is selected, the calling method is: '/pretrainmodel/' + args.pretrainmodelname + +In addition, if you want to get the model file after each training, you can call the uploader_for_gpu tool, +which is written as: +import os +os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/") +''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse +import os + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#The dataset location is placed under /dataset +parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') +#获取模型文件名称 +parser.add_argument('--ckpt_url', default="", help='pretrain model path') + +# 参数声明 +WORKERS = 0 # dataloder线程数 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = Model().to(device) +optimizer = SGD(model.parameters(), lr=1e-1) +cost = CrossEntropyLoss() + +# 模型训练 +def train(model, train_loader, epoch): + model.train() + train_loss = 0 + for i, data in enumerate(train_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + optimizer.zero_grad() + y_hat = model(x) + loss = cost(y_hat, y) + loss.backward() + optimizer.step() + train_loss += loss + loss_mean = train_loss / (i+1) + print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean.item())) + +# 模型测试 +def test(model, test_loader, test_data): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for i, data in enumerate(test_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + optimizer.zero_grad() + y_hat = model(x) + test_loss += cost(y_hat, y).item() + pred = y_hat.max(1, keepdim=True)[1] + correct += pred.eq(y.view_as(pred)).sum().item() + test_loss /= (i+1) + print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_data), 100. 
* correct / len(test_data))) +def main(): + # 如果有保存的模型,则加载模型,并在其基础上继续训练 + if os.path.exists(args.ckpt_url): + checkpoint = torch.load(args.ckpt_url) + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + start_epoch = checkpoint['epoch'] + print('加载 epoch {} 权重成功!'.format(start_epoch)) + else: + start_epoch = 0 + print('无保存模型,将从头开始训练!') + + for epoch in range(start_epoch+1, epochs): + train(model, train_loader, epoch) + test(model, test_loader, test_dataset) + # 保存模型 + state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch} + torch.save(state, '/tmp/output/mnist_epoch{}.pkl'.format(epoch)) + #After calling uploader_for_gpu, after each epoch training, the result file under /tmp/output will be sent back to Qizhi + os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/") + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #log output + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + epochs = args.epoch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + main() + + + \ No newline at end of file diff --git a/gpu/train.py b/gpu/train.py index ccedb05..c7075ed 100755 --- a/gpu/train.py +++ b/gpu/train.py @@ -30,6 +30,7 @@ from torch.optim import SGD from torch.utils.data import DataLoader from torchvision.transforms import ToTensor import argparse +import datetime # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') @@ -39,10 +40,14 @@ parser.add_argument('--testdata', default="/dataset/test" ,help='path to test da parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') +def gettime(): + timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + return timestr + if __name__ == '__main__': args, unknown = parser.parse_known_args() #log output - print('cuda is available:{}'.format(torch.cuda.is_available())) + print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available())) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") batch_size = args.batch_size train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) @@ -53,9 +58,9 @@ if __name__ == '__main__': sgd = SGD(model.parameters(), lr=1e-1) cost = CrossEntropyLoss() epoch = args.epoch_size - print('epoch_size is:{}'.format(epoch)) + print(gettime(), 'epoch_size is:{}'.format(epoch)) for _epoch in range(epoch): - print('the {} epoch_size begin'.format(_epoch + 1)) + print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1)) model.train() for idx, (train_x, train_label) in enumerate(train_loader): train_x = train_x.to(device) @@ -64,8 +69,10 @@ if __name__ == '__main__': sgd.zero_grad() predict_y = model(train_x.float()) loss = cost(predict_y, train_label.long()) - if idx % 10 == 0: - print('idx: {}, loss: {}'.format(idx, loss.sum().item())) + #if idx % 10 == 0: + #print('idx: {}, loss: {}'.format(idx, loss.sum().item())) + print(gettime()) + print('idx: {}, loss: {}'.format(idx, 
loss.sum().item())) loss.backward() sgd.step() @@ -81,6 +88,6 @@ if __name__ == '__main__': _ = predict_ys == test_label correct += np.sum(_.numpy(), axis=-1) _sum += _.shape[0] - print('accuracy: {:.2f}'.format(correct / _sum)) + print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum)) #The model output location is placed under /model torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) diff --git a/gpu/train_continue.py b/gpu/train_continue.py new file mode 100755 index 0000000..91ee495 --- /dev/null +++ b/gpu/train_continue.py @@ -0,0 +1,121 @@ +##################################################################################################### +# 继续训练功能:修改训练任务时,若勾选复用上次结果,则可在新训练任务的输出路径中读取到上次结果 +# +# 示例用法 +# - 增加两个训练参数 +# 'ckpt_save_name' 此次任务的输出文件名称 +# 'ckpt_load_name' 上一次任务的输出文件名,用于加载上一次输出的模型文件名称,默认为空,则不读取任何文件 +# - 训练代码中判断 'ckpt_load_name' 是否为空,若不为空,则为继续训练任务 +##################################################################################################### + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse +import os + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#The dataset location is placed under /dataset +parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') +#获取预训练模型文件名称 +parser.add_argument('--ckpt_url', default="", help='pretrain model path') +#继续训练模型文件名称 +parser.add_argument('--ckpt_save_name', default="", help='save model name') +parser.add_argument('--ckpt_load_name', default="", help='load model name') + +# 参数声明 +WORKERS = 0 # dataloder线程数 +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +model = Model().to(device) +optimizer = SGD(model.parameters(), lr=1e-1) +cost = CrossEntropyLoss() + +# 模型训练 +def train(model, train_loader, epoch): + model.train() + train_loss = 0 + for i, data in enumerate(train_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + optimizer.zero_grad() + y_hat = model(x) + loss = cost(y_hat, y) + loss.backward() + optimizer.step() + train_loss += loss + loss_mean = train_loss / (i+1) + print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean.item())) + +# 模型测试 +def test(model, test_loader, test_data): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for i, data in enumerate(test_loader, 0): + x, y = data + x = x.to(device) + y = y.to(device) + optimizer.zero_grad() + y_hat = model(x) + test_loss += cost(y_hat, y).item() + pred = y_hat.max(1, keepdim=True)[1] + correct += pred.eq(y.view_as(pred)).sum().item() + test_loss /= (i+1) + print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_data), 100. 
* correct / len(test_data)))
+
+def main():
+    base_path = "/model" # change to "/tmp/output" when running on the intelligent-computing cluster
+
+    # load the pretrained model only on the first job, i.e. when args.ckpt_load_name is empty
+    if os.path.exists(args.ckpt_url) and not args.ckpt_load_name:
+        checkpoint = torch.load(args.ckpt_url)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('Loaded pretrained weights from epoch {}!'.format(start_epoch))
+    # resume-training load: requires an output file from a previous job
+    elif args.ckpt_load_name:
+        load_path = "{}/{}.pkl".format(base_path, args.ckpt_load_name)
+        checkpoint = torch.load(load_path)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('Resumed training weights from epoch {}!'.format(start_epoch))
+    else:
+        print('No saved model found, training from scratch!')
+
+    for epoch in range(epochs):
+        train(model, train_loader, epoch)
+        test(model, test_loader, test_dataset)
+        # save the model
+        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
+        save_path = "{}/{}.pkl".format(base_path, args.ckpt_save_name)
+        torch.save(state, save_path)
+
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    epochs = args.epoch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    main()
+
+
diff --git a/gpu/train_continue_c2net.py b/gpu/train_continue_c2net.py
new file mode 100755
index 0000000..196981e
--- /dev/null
+++ b/gpu/train_continue_c2net.py
@@ -0,0 +1,122 @@
+#####################################################################################################
+# Resume training: when modifying a training job, if "reuse last results" is checked, the previous
+# job's results can be read from the new job's output path.
+#
+# Example usage
+#   - Add two training parameters:
+#     'ckpt_save_name'  output file name of this job
+#     'ckpt_load_name'  output file name of the previous job, used to load the previously saved
+#                       model; empty by default, in which case no file is read
+#   - The training code checks whether 'ckpt_load_name' is empty; if it is not, this is a
+#     resume-training job
+#####################################################################################################
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import os
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /tmp/dataset
+parser.add_argument('--traindata', default="/tmp/dataset/train" ,help='path to train dataset')
+parser.add_argument('--testdata', default="/tmp/dataset/test" ,help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each epoch')
+# name of the pretrained model file
+parser.add_argument('--ckpt_url', default="", help='pretrain model path')
+# file names used for resume training
+parser.add_argument('--ckpt_save_name', default="", help='save model name')
+parser.add_argument('--ckpt_load_name', default="", help='load model name')
+
+# parameter declarations
+WORKERS = 0 # number of dataloader worker threads
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+model = Model().to(device)
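+# A hypothetical pair of runs illustrating the resume flow described in the header comment
+# (the names are placeholders, not platform defaults):
+#   first job:      --ckpt_save_name run1
+#   follow-up job:  --ckpt_save_name run2 --ckpt_load_name run1
+# so that main() below reloads run1.pkl from base_path before training continues.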
+optimizer = SGD(model.parameters(), lr=1e-1)
+cost = CrossEntropyLoss()
+
+# model training
+def train(model, train_loader, epoch):
+    model.train()
+    train_loss = 0
+    for i, data in enumerate(train_loader, 0):
+        x, y = data
+        x = x.to(device)
+        y = y.to(device)
+        optimizer.zero_grad()
+        y_hat = model(x)
+        loss = cost(y_hat, y)
+        loss.backward()
+        optimizer.step()
+        train_loss += loss
+    loss_mean = train_loss / (i+1)
+    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean.item()))
+
+# model evaluation
+def test(model, test_loader, test_data):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for i, data in enumerate(test_loader, 0):
+            x, y = data
+            x = x.to(device)
+            y = y.to(device)
+            optimizer.zero_grad()
+            y_hat = model(x)
+            test_loss += cost(y_hat, y).item()
+            pred = y_hat.max(1, keepdim=True)[1]
+            correct += pred.eq(y.view_as(pred)).sum().item()
+    test_loss /= (i+1)
+    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_data), 100. * correct / len(test_data)))
+
+def main():
+    base_path = "/tmp/output" # "/tmp/output" is the output path on the intelligent-computing cluster
+
+    # load the pretrained model only on the first job, i.e. when args.ckpt_load_name is empty
+    if os.path.exists(args.ckpt_url) and not args.ckpt_load_name:
+        checkpoint = torch.load(args.ckpt_url)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('Loaded pretrained weights from epoch {}!'.format(start_epoch))
+    # resume-training load: requires an output file from a previous job
+    elif args.ckpt_load_name:
+        load_path = "{}/{}.pkl".format(base_path, args.ckpt_load_name)
+        checkpoint = torch.load(load_path)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('Resumed training weights from epoch {}!'.format(start_epoch))
+    else:
+        print('No saved model found, training from scratch!')
+
+    for epoch in range(epochs):
+        train(model, train_loader, epoch)
+        test(model, test_loader, test_dataset)
+        # save the model
+        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
+        save_path = "{}/{}.pkl".format(base_path, args.ckpt_save_name)
+        torch.save(state, save_path)
+        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
+
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    epochs = args.epoch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    main()
+
+
diff --git a/gpu/train_fail.py b/gpu/train_fail.py
new file mode 100644
index 0000000..f08e5cf
--- /dev/null
+++ b/gpu/train_fail.py
@@ -0,0 +1,2 @@
+import aaaa
+print('test failure, no module')
\ No newline at end of file
diff --git a/gpu/train_fail2.py b/gpu/train_fail2.py
new file mode 100644
index 0000000..9a60d48
--- /dev/null
+++ b/gpu/train_fail2.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+Due to A100 compatibility, before using the training environment, please use the platform's
+recommended image with CUDA 11, then adjust the code and submit the image.
+The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 +In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. +If it is a single dataset: +if MnistDataset_torch.zip is selected,Then the dataset directory is /dataset/train, /dataset/test; +If it is a multiple dataset: +If MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected, +the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test +and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl + +The model download path is under /model by default. Please specify the model output location to /model, +and the Qizhi platform will provide file downloads under the /model directory. +''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse +import datetime + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#The dataset location is placed under /dataset +parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +def gettime(): + timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + return timestr + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #log output + print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + sgd = SGD(model.parameters(), lr=1e-1) + cost = CrossEntropyLoss() + epoch = args.epoch_size + print(gettime(), 'epoch_size is:{}'.format(epoch)) + for _epoch in range(epoch): + print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1)) + model.train() + for idx, (train_x, train_label) in enumerate(train_loader): + train_x = train_x.to(device) + train_label = train_label.to(device) + label_np = np.zeros((train_label.shape[0], 10)) + sgd.zero_grad() + predict_y = model(train_x.float()) + loss = cost(predict_y, train_label.long()) + if idx % 10 == 0: + print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item())) + loss.backward() + sgd.step() + + correct = 0 + _sum = 0 + model.eval() + for idx, (test_x, test_label) in enumerate(test_loader): + test_x = test_x + test_label = test_label + predict_y = model(test_x.to(device).float()).detach() + predict_ys = np.argmax(predict_y.cpu(), axis=-1) + label_np = test_label.numpy() + _ = predict_ys == test_label + correct += np.sum(_.numpy(), axis=-1) + _sum += _.shape[0] + print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum)) + #The model output location is placed under /model + torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) 
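+    # NOTE: like train_fail.py and train_fail3.py, this script is presumably meant to exercise
+    # the platform's failure handling: the second print below is left syntactically
+    # unterminated on purpose, so the job fails with a SyntaxError.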
+ print("----------this is the end--------") + print("abc" diff --git a/gpu/train_fail3.py b/gpu/train_fail3.py new file mode 100644 index 0000000..1bc70bb --- /dev/null +++ b/gpu/train_fail3.py @@ -0,0 +1,94 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +Due to the adaptability of a100, before using the training environment, please use the recommended image of the +platform with cuda 11.Then adjust the code and submit the image. +The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 +In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. +If it is a single dataset: +if MnistDataset_torch.zip is selected,Then the dataset directory is /dataset/train, /dataset/test; +If it is a multiple dataset: +If MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected, +the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test +and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl + +The model download path is under /model by default. Please specify the model output location to /model, +and the Qizhi platform will provide file downloads under the /model directory. +''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse +import datetime + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#The dataset location is placed under /dataset +parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset') +parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +def gettime(): + timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + return timestr + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #log output + print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + sgd = SGD(model.parameters(), lr=1e-1) + cost = CrossEntropyLoss() + epoch = args.epoch_size + print(gettime(), 'epoch_size is:{}'.format(epoch)) + for _epoch in range(epoch): + print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1)) + model.train() + for idx, (train_x, train_label) in enumerate(train_loader): + train_x = train_x.to(device) + train_label = train_label.to(device) + label_np = np.zeros((train_label.shape[0], 10)) + sgd.zero_grad() + predict_y = model(train_x.float()) + loss = cost(predict_y, train_label.long()) + #if idx % 10 == 0: + #print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item())) + print(gettime(), 'idx: {}, loss: {}'.format(idx, 
loss.sum().item())) + loss.backward() + sgd.step() + + correct = 0 + _sum = 0 + model.eval() + for idx, (test_x, test_label) in enumerate(test_loader): + test_x = test_x + test_label = test_label + predict_y = model(test_x.to(device).float()).detach() + predict_ys = np.argmax(predict_y.cpu(), axis=-1) + label_np = test_label.numpy() + _ = predict_ys == test_label + correct += np.sum(_.numpy(), axis=-1) + _sum += _.shape[0] + print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum)) + #The model output location is placed under /model + torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) + print("----------this is the end--------") + print(a) diff --git a/gpu/train_log.py b/gpu/train_log.py new file mode 100644 index 0000000..c079c80 --- /dev/null +++ b/gpu/train_log.py @@ -0,0 +1,93 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +Due to the adaptability of a100, before using the training environment, please use the recommended image of the +platform with cuda 11.Then adjust the code and submit the image. +The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 +In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. +If it is a single dataset: +if MnistDataset_torch.zip is selected,Then the dataset directory is /dataset/train, /dataset/test; +If it is a multiple dataset: +If MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected, +the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test +and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl + +The model download path is under /model by default. Please specify the model output location to /model, +and the Qizhi platform will provide file downloads under the /model directory. 
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import datetime
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /dataset
+parser.add_argument('--traindata', default="/dataset/train" ,help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test" ,help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='how many epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each epoch')
+
+def gettime():
+    timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    return timestr
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    print(gettime(), 'epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        print(gettime(), 'the {} epoch_size begin'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            label_np = np.zeros((train_label.shape[0], 10))
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+
+            print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            if idx % 10 == 0:
+                print("------------------")
+            loss.backward()
+            sgd.step()
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            test_x = test_x
+            test_label = test_label
+            predict_y = model(test_x.to(device).float()).detach()
+            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+            label_np = test_label.numpy()
+            _ = predict_ys == test_label
+            correct += np.sum(_.numpy(), axis=-1)
+            _sum += _.shape[0]
+        print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum))
+        #The model output location is placed under /model
+        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
diff --git a/gpu_new/README.md b/gpu_new/README.md
new file mode 100755
index 0000000..91c4c8d
--- /dev/null
+++ b/gpu_new/README.md
@@ -0,0 +1,112 @@
+# How to train a model on the Qizhi (OpenI) platform - GPU version
+
+- Single-dataset training on the Qizhi cluster, multi-dataset training on the Qizhi cluster, and single-dataset training on the intelligent-computing cluster are used in three different ways; please keep them apart:
+
+    - For single-dataset training on the Qizhi cluster, see the code comments in [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train.py)
+    - For single-dataset training that **loads a model** on the Qizhi cluster, see the code comments in [pretrain.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/pretrain.py)
+    - For multi-dataset training on the Qizhi cluster, see the code comments in [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_multidataset.py)
+    - For single-dataset training on the intelligent-computing cluster, see the code comments in [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_c2net.py)
+    - For single-dataset training that **loads a model** on the intelligent-computing cluster, see the code comments in [pretrain_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/pretrain_for_c2net.py)
+- On the Qizhi cluster, single-dataset and multi-dataset jobs differ only in how the dataset is mounted:
+  with a single dataset, MNISTDataset_torch.zip in this example is used from /dataset/;
+  with multiple datasets, MNISTDataset_torch.zip is used from /dataset/MNISTDataset_torch/.
+- On the intelligent-computing network, if you need the training results returned after every epoch, use the upload tool to push the contents of the /tmp/output folder back to Qizhi for download, written as:
+
+    ```
+    os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
+    ```
+
+## 1 Overview
+
+- Using #LeNet5-MNIST-PyTorch as an example, this project briefly introduces how to complete training jobs with PyTorch on the Qizhi collaborative AI platform, covering single-dataset training, multi-dataset training, and training on the intelligent-computing network, to give AI developers a Qizhi training example.
+- Users can create their own training jobs directly from the dataset and code files provided by this project.
+
+## 2 Preparation
+
+- To use the Qizhi platform, you need to create a Qizhi account, clone the code into your own account, and upload the dataset. The beginner boot-camp course series in the [OpenI_Learning](https://git.openi.org.cn/zeizei/OpenI_Learning) project walks through these steps.
+
+### 2.1 Data preparation
+
+#### Getting the dataset
+
+- If you only want to test-run this example, there is no need to upload the dataset again: MnistDataset_torch.zip is already set as a public dataset and can be referenced directly. The datasets can also be downloaded from this project's dataset page to inspect their structure: [MNISTDataset_torch.zip download](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0), [mnist_epoch1_0.73.pkl.zip download](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/datasets?type=0).
+- Data files
+    - The MNIST dataset consists of 28*28 grayscale images in 10 classes; the training set contains 60000 images and the test set contains 10000 images.
+    - The directory structure of the dataset archives is as follows:
+
+        > MNISTDataset_torch.zip
+        > ├── test
+        > │   └── MNIST
+        > │       ├── raw
+        > │       │   ├── t10k-images-idx3-ubyte
+        > │       │   ├── t10k-labels-idx1-ubyte
+        > │       │   ├── train-images-idx3-ubyte
+        > │       │   └── train-labels-idx1-ubyte
+        > │       └── processed
+        > │           ├── test.pt
+        > │           └── training.pt
+        > └── train
+        >     └── MNIST
+        >         ├── raw
+        >         │   ├── t10k-images-idx3-ubyte
+        >         │   ├── t10k-labels-idx1-ubyte
+        >         │   ├── train-images-idx3-ubyte
+        >         │   └── train-labels-idx1-ubyte
+        >         └── processed
+        >             ├── test.pt
+        >             └── training.pt
+        >
+
+        > mnist_epoch1_0.73.pkl.zip
+        > ├── mnist_epoch1_0.73.pkl
+        >
+
+#### Uploading the dataset
+
+GPU training runs on GPU hardware, so the dataset must be uploaded on the GPU page. (This step is not needed for this example; just select the public dataset MNISTDataset_torch.zip.)
+
+### 2.2 Preparing the scripts
+
+#### Example code
+
+- The example code can be downloaded from this repository: [code download](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU)
+- Code files
+    - [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train.py), the script for single-dataset training. For details see [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train.py)
+    - [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_multidataset.py), the script for multi-dataset training. For details see [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_multidataset.py)
+    - [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_c2net.py), the script for training on the intelligent-computing network. For details see [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/train_for_c2net.py)
+    - [model.py](https://git.openi.org.cn/OpenIOSSG/MNIST_PytorchExample_GPU/src/branch/master/model.py), the network being trained; used by single-dataset, multi-dataset, and intelligent-computing-network training.
+
+## 3 Creating a training job
+
+Once the data and the scripts are ready, create a training job to run the PyTorch script. First-time users can refer to this example code.
+
+### Training UI example
+
+Because of A100 compatibility, the A100 requires CUDA 11 or later. The platform already provides A100-based CUDA base images; simply select the corresponding public image:
+![avatar](Example_picture/适用A100的基础镜像.png)
+The training UI parameters can be set as follows:
+![avatar](Example_picture/基础镜像.png)
+
+Table 1  Parameters on the create-training-job page
+
+| Parameter | Description |
+| -------- | ----------- |
+| Compute resource | Select CPU/GPU |
+| Code branch | Select the branch of the repository code to use; master can be selected by default |
+| Image | Select an image already debugged in the debugging environment; for the current version please select a base image: the platform provides A100-based CUDA base images such as dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 |
+| Start file | Select the launch script train.py in the code directory |
+| Dataset | Select the public dataset MnistDataset_torch.zip already uploaded to the Qizhi platform |
+| Run parameters | Additional run parameters pass values to other arguments of the script, e.g. epoch_size |
+| Resource spec | Select a specification with the required number of GPUs |
+
+## 4 Viewing the results
+
+### 4.1 The run log can be viewed on the training-job page
+
+Currently a training job can only log through print statements in the code; see the print calls in the example train.py.
+
+### 4.2 The model file can be downloaded after training finishes
+
+![avatar](Example_picture/结果下载.png)
+
+## For any questions about the example code, feel free to open an issue in this project.
diff --git a/gpu_new/inference.py b/gpu_new/inference.py
new file mode 100755
index 0000000..b3ae61b
--- /dev/null
+++ b/gpu_new/inference.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+GPU INFERENCE INSTANCE
+
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+Due to A100 compatibility, please use the platform's recommended image with CUDA 11,
+then adjust the code and submit the image.
+The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the environment, the uploaded dataset will be automatically placed in the /dataset directory.
+If MnistDataset_torch.zip is selected, the dataset directory is /dataset/test;
+
+The selected model file is placed in the /model directory.
+The result download path is /result, and the Qizhi platform provides file downloads under the /result directory.
+
+NOTE: the inference environment currently has no internet access, so a public image cannot be
+used: the image must first be submitted to the Qizhi platform, and the inference dataset must
+likewise be uploaded to the Qizhi platform first.
+'''
+
+
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import os
+import argparse
+
+
+
+# Inference settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# name of the model file
+parser.add_argument('--modelname', help='model name')
+
+
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    test_dataset = mnist.MNIST(root='/dataset/test', train=False, transform=ToTensor(),
+                               download=False)
+    test_loader = DataLoader(test_dataset, batch_size=256)
+    # if the file name is fixed, model_path can simply be hard-coded
+    model_path = '/model/'+args.modelname
+
+    model = torch.load(model_path).to(device)
+    model.eval()
+
+    correct = 0
+    _sum = 0
+
+    for idx, (test_x, test_label) in enumerate(test_loader):
+        test_x = test_x
+        test_label = test_label
+        predict_y = model(test_x.to(device).float()).detach()
+        predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+        label_np = test_label.numpy()
+        _ = predict_ys == test_label
+        correct += np.sum(_.numpy(), axis=-1)
+        _sum += _.shape[0]
+    print('accuracy: {:.2f}'.format(correct / _sum))
+    # write the result to /result
+    filename = 'result.txt'
+    file_path = os.path.join('/result', filename)
+    with open(file_path, 'w') as file:
+        file.write('accuracy: {:.2f}'.format(correct / _sum))
\ No newline at end of file
diff --git a/gpu_new/model.py b/gpu_new/model.py
new file mode 100755
index 0000000..ae424a7
--- /dev/null
+++ b/gpu_new/model.py
@@ -0,0 +1,35 @@
+from torch.nn import Module
+from torch import nn
+
+
+class 
Model(Module): + def __init__(self): + super(Model, self).__init__() + self.conv1 = nn.Conv2d(1, 6, 5) + self.relu1 = nn.ReLU() + self.pool1 = nn.MaxPool2d(2) + self.conv2 = nn.Conv2d(6, 16, 5) + self.relu2 = nn.ReLU() + self.pool2 = nn.MaxPool2d(2) + self.fc1 = nn.Linear(256, 120) + self.relu3 = nn.ReLU() + self.fc2 = nn.Linear(120, 84) + self.relu4 = nn.ReLU() + self.fc3 = nn.Linear(84, 10) + self.relu5 = nn.ReLU() + + def forward(self, x): + y = self.conv1(x) + y = self.relu1(y) + y = self.pool1(y) + y = self.conv2(y) + y = self.relu2(y) + y = self.pool2(y) + y = y.view(y.shape[0], -1) + y = self.fc1(y) + y = self.relu3(y) + y = self.fc2(y) + y = self.relu4(y) + y = self.fc3(y) + y = self.relu5(y) + return y diff --git a/gpu_new/pretrain.py b/gpu_new/pretrain.py new file mode 100755 index 0000000..fcedb1e --- /dev/null +++ b/gpu_new/pretrain.py @@ -0,0 +1,125 @@ +#!/usr/bin/python +#coding=utf-8 +''' +If there are Chinese comments in the code,please add at the beginning: +#!/usr/bin/python +#coding=utf-8 + +1,The dataset structure of the single-dataset in this example + MnistDataset_torch.zip + ├── test + └── train + +2,Due to the adaptability of a100, before using the training environment, please use the recommended image of the +platform with cuda 11.Then adjust the code and submit the image. +The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191 +In the training environment, the uploaded dataset will be automatically placed in the /dataset directory. +Note: the paths are different when selecting a single dataset and multiple datasets. +(1)If it is a single dataset: if MnistDataset_torch.zip is selected, + the dataset directory is /dataset/train, /dataset/test; + If it is a multiple dataset: if MnistDataset_torch.zip is selected, + the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test; + +(2)If the pre-training model file is selected, the selected pre-training model path save as parameter ckpt_url; + +The model download path is under /model by default. Please specify the model output location to /model, +and the Qizhi platform will provide file downloads under the /model directory. 
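+
+A hypothetical example invocation, for illustration only (on the Qizhi platform the
+run parameters are passed to this startup script as ordinary command-line arguments):
+    python pretrain.py --traindata /dataset/train --testdata /dataset/test \
+        --epoch_size 10 --ckpt_url /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl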
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import os
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /dataset
+parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each epoch')
+# path of the pre-trained model file
+parser.add_argument('--ckpt_url', default="", help='pretrain model path')
+
+# parameter declarations
+WORKERS = 0  # number of dataloader worker threads
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+model = Model().to(device)
+optimizer = SGD(model.parameters(), lr=1e-1)
+cost = CrossEntropyLoss()
+
+# model training
+def train(model, train_loader, epoch):
+    model.train()
+    train_loss = 0
+    for i, data in enumerate(train_loader, 0):
+        x, y = data
+        x = x.to(device)
+        y = y.to(device)
+        optimizer.zero_grad()
+        y_hat = model(x)
+        loss = cost(y_hat, y)
+        loss.backward()
+        optimizer.step()
+        train_loss += loss
+    loss_mean = train_loss / (i+1)
+    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean.item()))
+
+# model evaluation
+def test(model, test_loader, test_data):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for i, data in enumerate(test_loader, 0):
+            x, y = data
+            x = x.to(device)
+            y = y.to(device)
+            y_hat = model(x)
+            test_loss += cost(y_hat, y).item()
+            pred = y_hat.max(1, keepdim=True)[1]
+            correct += pred.eq(y.view_as(pred)).sum().item()
+        test_loss /= (i+1)
+    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_data), 100. * correct / len(test_data)))
+
+def main():
+    # if a saved model exists, load it and continue training from it
+    if os.path.exists(args.ckpt_url):
+        checkpoint = torch.load(args.ckpt_url)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('Successfully loaded the weights of epoch {}!'.format(start_epoch))
+    else:
+        start_epoch = 0
+        print('No saved model found, training from scratch!')
+
+    for epoch in range(start_epoch + 1, epochs + 1):
+        train(model, train_loader, epoch)
+        test(model, test_loader, test_dataset)
+        # save the model
+        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
+        torch.save(state, '/model/mnist_epoch{}.pkl'.format(epoch))
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    epochs = args.epoch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    main()
+
+
diff --git a/gpu_new/pretrain_for_c2net.py b/gpu_new/pretrain_for_c2net.py
new file mode 100755
index 0000000..0920f7d
--- /dev/null
+++ b/gpu_new/pretrain_for_c2net.py
@@ -0,0 +1,141 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+In the training environment,
+(1)the code will be automatically placed in the /tmp/code directory,
+(2)the uploaded dataset will be automatically placed in the /tmp/dataset directory
+Note: the paths are different when selecting a single dataset and multiple datasets.
+(1)If it is a single dataset: if MnistDataset_torch.zip is selected,
+   the dataset directory is /tmp/dataset/train, /tmp/dataset/test;
+
+The dataset structure of the single dataset in the training image in this example:
+ tmp
+  ├──dataset
+     ├── test
+     └── train
+
+If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
+the dataset directory is /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
+and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
+The dataset structure in the training image for multiple datasets in this example:
+tmp
+  ├──dataset
+     ├── MnistDataset_torch
+     |   ├── test
+     |   └── train
+     └── checkpoint_epoch1_0.73
+         ├── mnist_epoch1_0.73.pkl
+(3)the model download path is under /tmp/output by default, please specify the model output location to /tmp/output,
+and the Qizhi platform will provide file downloads under the /tmp/output directory.
+(4)If a pre-training model file is selected, the selected pre-training model path is saved in the parameter ckpt_url;
+
+In addition, if you want to get the model file after each epoch of training, you can call the uploader_for_gpu tool,
+which is invoked as:
+import os
+os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import os
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /tmp/dataset
+parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=10, help='how many epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each epoch')
+# path of the pre-trained model file
+parser.add_argument('--ckpt_url', default="", help='pretrain model path')
+
+# parameter declarations
+WORKERS = 0  # number of dataloader worker threads
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+model = Model().to(device)
+optimizer = SGD(model.parameters(), lr=1e-1)
+cost = CrossEntropyLoss()
+
+# model training
+def train(model, train_loader, epoch):
+    model.train()
+    train_loss = 0
+    for i, data in enumerate(train_loader, 0):
+        x, y = data
+        x = x.to(device)
+        y = y.to(device)
+        optimizer.zero_grad()
+        y_hat = model(x)
+        loss = cost(y_hat, y)
+        loss.backward()
+        optimizer.step()
+        train_loss += loss
+    loss_mean = train_loss / (i+1)
+    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean.item()))
+
+# model evaluation
+def test(model, test_loader, test_data):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for i, data in enumerate(test_loader, 0):
+            x, y = data
+            x = x.to(device)
+            y = y.to(device)
+            y_hat = model(x)
+            test_loss += cost(y_hat, y).item()
+            pred = y_hat.max(1, keepdim=True)[1]
+            correct += pred.eq(y.view_as(pred)).sum().item()
+        test_loss /= (i+1)
+    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_data), 100. * correct / len(test_data)))
+
+def main():
+    # if a saved model exists, load it and continue training from it
+    if os.path.exists(args.ckpt_url):
+        checkpoint = torch.load(args.ckpt_url)
+        model.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        start_epoch = checkpoint['epoch']
+        print('Successfully loaded the weights of epoch {}!'.format(start_epoch))
+    else:
+        start_epoch = 0
+        print('No saved model found, training from scratch!')
+
+    for epoch in range(start_epoch + 1, epochs + 1):
+        train(model, train_loader, epoch)
+        test(model, test_loader, test_dataset)
+        # save the model
+        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
+        torch.save(state, '/tmp/output/mnist_epoch{}.pkl'.format(epoch))
+        #After each training epoch, uploader_for_gpu sends the result files under /tmp/output back to Qizhi
+        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    epochs = args.epoch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    main()
+
+
+
\ No newline at end of file
diff --git a/gpu_new/test_inference.py b/gpu_new/test_inference.py
new file mode 100644
index 0000000..68952e3
--- /dev/null
+++ b/gpu_new/test_inference.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+GPU INFERENCE INSTANCE
+
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+Due to the adaptability of the A100, please use the recommended image of the
+platform with CUDA 11, then adjust the code and submit the image.
+The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the environment, the uploaded dataset will be automatically placed in the /dataset directory.
+If MnistDataset_torch.zip is selected, the dataset directory is /dataset/test;
+
+The selected model file is in the /model directory.
+The result download path is under /result, and the Qizhi platform will provide file downloads under the /result directory.
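+
+A hypothetical example invocation, for illustration only (--modelname is the file name
+of a checkpoint previously saved under /model, e.g. by train.py):
+    python test_inference.py --modelname mnist_epoch1_0.73.pkl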
+
+The image in this example is dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191.
+The selected dataset is placed in the /dataset directory.
+The selected model file is placed in the /model directory.
+The output result path is the /result directory.
+
+!!!Note: the inference environment currently has no Internet access, so a public image cannot be used; the image must first be submitted to the Qizhi platform, and the inference dataset must also be uploaded to the Qizhi platform first.
+
+'''
+
+
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import os
+import argparse
+from model import Model
+
+
+
+# Inference settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# name of the model file to load
+parser.add_argument('--modelname', help='model name')
+
+
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    test_dataset = mnist.MNIST(root='/dataset/test', train=False, transform=ToTensor(),
+                               download=False)
+    test_loader = DataLoader(test_dataset, batch_size=256)
+    # if the file name is fixed, model_path can simply be hard-coded
+    model_path = '/model/' + args.modelname
+
+    model = Model().to(device)
+    checkpoint = torch.load(model_path)
+    model.load_state_dict(checkpoint['model'])
+
+    model.eval()
+
+    correct = 0
+    _sum = 0
+
+    for idx, (test_x, test_label) in enumerate(test_loader):
+        predict_y = model(test_x.to(device).float()).detach()
+        predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+        _ = predict_ys == test_label
+        correct += np.sum(_.numpy(), axis=-1)
+        _sum += _.shape[0]
+    print('accuracy: {:.2f}'.format(correct / _sum))
+    # write the result to /result
+    filename = 'result.txt'
+    file_path = os.path.join('/result', filename)
+    with open(file_path, 'w') as file:
+        file.write('accuracy: {:.2f}'.format(correct / _sum))
\ No newline at end of file
diff --git a/gpu_new/train.py b/gpu_new/train.py
new file mode 100755
index 0000000..6912eb9
--- /dev/null
+++ b/gpu_new/train.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+Due to the adaptability of the A100, before using the training environment, please use the recommended image of the
+platform with CUDA 11, then adjust the code and submit the image.
+The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the training environment, the uploaded dataset will be automatically placed in the /dataset directory.
+If it is a single dataset:
+if MnistDataset_torch.zip is selected, the dataset directory is /dataset/train, /dataset/test;
+If it is a multiple dataset:
+if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
+the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
+and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
+
+The model download path is under /model by default. Please specify the model output location to /model,
+and the Qizhi platform will provide file downloads under the /model directory.
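+
+A hypothetical example invocation, for illustration only:
+    python train.py --traindata /dataset/train --testdata /dataset/test --epoch_size 1 --batch_size 256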
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /dataset
+parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='how many epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each epoch')
+
+# parameter declarations
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    print('epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        #print('epoch {} begins'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            sgd.step()
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            predict_y = model(test_x.to(device).float()).detach()
+            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+            _ = predict_ys == test_label
+            correct += np.sum(_.numpy(), axis=-1)
+            _sum += _.shape[0]
+        print('accuracy: {:.2f}'.format(correct / _sum))
+        #The model output location is placed under /model
+        state = {'model':model.state_dict(), 'optimizer':sgd.state_dict(), 'epoch':epoch}
+        torch.save(state, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
diff --git a/gpu_new/train_for_c2net.py b/gpu_new/train_for_c2net.py
new file mode 100755
index 0000000..90f3f48
--- /dev/null
+++ b/gpu_new/train_for_c2net.py
@@ -0,0 +1,111 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+In the training environment,
+the code will be automatically placed in the /tmp/code directory,
+the uploaded dataset will be automatically placed in the /tmp/dataset directory
+
+Note: the paths are different when selecting a single dataset and multiple datasets.
+(1)If it is a single dataset: if MnistDataset_torch.zip is selected,
+   the dataset directory is /tmp/dataset/train, /tmp/dataset/test;
+
+The dataset structure of the single dataset in the training image in this example:
+ tmp
+  ├──dataset
+     ├── test
+     └── train
+
+If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
+the dataset directory is /tmp/dataset/MnistDataset_torch/train, /tmp/dataset/MnistDataset_torch/test
+and /tmp/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
+The dataset structure in the training image for multiple datasets in this example:
+tmp
+  ├──dataset
+     ├── MnistDataset_torch
+     |   ├── test
+     |   └── train
+     └── checkpoint_epoch1_0.73
+         ├── mnist_epoch1_0.73.pkl
+
+
+The model download path is under /tmp/output by default; please specify the model output location to /tmp/output,
+and the Qizhi platform will provide file downloads under the /tmp/output directory.
+
+In addition, if you want to get the model file after each epoch of training, you can call the uploader_for_gpu tool,
+which is invoked as:
+import os
+os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
+'''
+
+
+from model import Model
+import numpy as np
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import os
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+#The dataset location is placed under /tmp/dataset
+parser.add_argument('--traindata', default="/tmp/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/tmp/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='how many epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used in each epoch')
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    #log output
+    print('cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    print('epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        print('epoch {} begins'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print('idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            sgd.step()
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            predict_y = model(test_x.to(device).float()).detach()
+            predict_ys = np.argmax(predict_y.cpu(), axis=-1)
+            _ = predict_ys == test_label
+            correct += np.sum(_.numpy(), axis=-1)
+            _sum += _.shape[0]
+        print('accuracy: {:.2f}'.format(correct / _sum))
+        #The model output location is placed under /tmp/output
+        state = {'model':model.state_dict(), 'optimizer':sgd.state_dict(), 'epoch':epoch}
+        torch.save(state, '/tmp/output/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum))
+        #After each epoch of training, uploader_for_gpu sends the result files under /tmp/output back to Qizhi
+        os.system("cd /tmp/script_for_grampus/ &&./uploader_for_gpu " + "/tmp/output/")
\ No newline at end of file
diff --git a/gpu_new/train_for_multidataset.py b/gpu_new/train_for_multidataset.py
new file mode 100755
index 0000000..08cc270
--- /dev/null
+++ b/gpu_new/train_for_multidataset.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+1,The dataset structure of the multi-dataset in this example
+ MnistDataset_torch.zip
+ ├── test
+ └── train
+
+ checkpoint_epoch1_0.73.zip
+ ├── mnist_epoch1_0.73.pkl
+
+2,Due to the adaptability of the A100, before using the training environment, please use the recommended image of the
+platform with CUDA 11, then adjust the code and submit the image.
+The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the training environment, the uploaded dataset will be automatically placed in the /dataset directory.
+Note: the paths are different when selecting a single dataset and multiple datasets.
+(1)If it is a single dataset: if MnistDataset_torch.zip is selected,
+   the dataset directory is /dataset/train, /dataset/test;
+
+The dataset structure of the single dataset in the training image in this example:
+ dataset
+  ├── test
+  └── train
+(2)If multiple datasets are selected, such as MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip,
+the dataset directory is /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
+and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl
+
+The dataset structure in the training image for multiple datasets in this example:
+ dataset
+  ├── MnistDataset_torch
+  |   ├── test
+  |   └── train
+  └── checkpoint_epoch1_0.73
+      ├── mnist_epoch1_0.73.pkl
+
+
+The model download path is under /model by default. Please specify the model output location to /model,
+and the Qizhi platform will provide file downloads under the /model directory.
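+
+A hypothetical example invocation, for illustration only (the paths follow the
+multi-dataset layout described above):
+    python train_for_multidataset.py --traindata /dataset/MnistDataset_torch/train \
+        --testdata /dataset/MnistDataset_torch/test \
+        --checkpoint /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl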
+''' + + +from model import Model +import numpy as np +import torch +from torchvision.datasets import mnist +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DataLoader +from torchvision.transforms import ToTensor +import argparse + +# Training settings +parser = argparse.ArgumentParser(description='PyTorch MNIST Example') +#The dataset location is placed under /dataset +parser.add_argument('--traindata', default="/dataset/MnistDataset_torch/train" ,help='path to train dataset') +parser.add_argument('--testdata', default="/dataset/MnistDataset_torch/test" ,help='path to test dataset') +parser.add_argument('--checkpoint', default="/dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl" ,help='checkpoint file') +parser.add_argument('--epoch_size', type=int, default=1, help='how much epoch to train') +parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch') + +if __name__ == '__main__': + args, unknown = parser.parse_known_args() + #log output + print('cuda is available:{}'.format(torch.cuda.is_available())) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + batch_size = args.batch_size + train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(),download=False) + test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(),download=False) + train_loader = DataLoader(train_dataset, batch_size=batch_size) + test_loader = DataLoader(test_dataset, batch_size=batch_size) + model = Model().to(device) + sgd = SGD(model.parameters(), lr=1e-1) + cost = CrossEntropyLoss() + epoch = args.epoch_size + print('epoch_size is:{}'.format(epoch)) + # Load the trained model + # path = args.checkpoint + # checkpoint = torch.load(path, map_location=device) + # model.load_state_dict(checkpoint) + for _epoch in range(epoch): + print('the {} epoch_size begin'.format(_epoch + 1)) + model.train() + for idx, (train_x, train_label) in enumerate(train_loader): + train_x = train_x.to(device) + train_label = train_label.to(device) + label_np = np.zeros((train_label.shape[0], 10)) + sgd.zero_grad() + predict_y = model(train_x.float()) + loss = cost(predict_y, train_label.long()) + if idx % 10 == 0: + print('idx: {}, loss: {}'.format(idx, loss.sum().item())) + loss.backward() + sgd.step() + + correct = 0 + _sum = 0 + model.eval() + for idx, (test_x, test_label) in enumerate(test_loader): + test_x = test_x + test_label = test_label + predict_y = model(test_x.to(device).float()).detach() + predict_ys = np.argmax(predict_y.cpu(), axis=-1) + label_np = test_label.numpy() + _ = predict_ys == test_label + correct += np.sum(_.numpy(), axis=-1) + _sum += _.shape[0] + print('accuracy: {:.2f}'.format(correct / _sum)) + #The model output location is placed under /model + torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch+1, correct / _sum)) \ No newline at end of file diff --git a/notebooks/1.md b/notebooks/1.md new file mode 100755 index 0000000..b6fc4c6 --- /dev/null +++ b/notebooks/1.md @@ -0,0 +1 @@ +hello \ No newline at end of file diff --git a/notebooks/testwj.ipynb b/notebooks/testwj.ipynb new file mode 100755 index 0000000..2df6b91 --- /dev/null +++ b/notebooks/testwj.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "14e5e20a", + "metadata": {}, + "outputs": [], + "source": [ + "print('hello world')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + 
"name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/test测试/1.txt b/notebooks/test测试/1.txt new file mode 100755 index 0000000..65e38bf --- /dev/null +++ b/notebooks/test测试/1.txt @@ -0,0 +1 @@ +well \ No newline at end of file diff --git a/notebooks/test测试/wj-Untitled.ipynb b/notebooks/test测试/wj-Untitled.ipynb new file mode 100755 index 0000000..a42c026 --- /dev/null +++ b/notebooks/test测试/wj-Untitled.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "90e7b1d4", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'torch'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'torch'" + ] + } + ], + "source": [ + "# -*- coding: utf-8 -*-\n", + "from __future__ import print_function, division\n", + "\n", + "# import sys\n", + "# sys.path.append('/home/xujiahong/openI_benchmark/vechicle_reID_VechicleNet/')\n", + "\n", + "import time\n", + "import yaml\n", + "import pickle\n", + "import torch\n", + "import torch.nn as nn\n", + "import numpy as np\n", + "from torchvision import datasets,transforms\n", + "import os\n", + "import scipy.io\n", + "from tqdm import tqdm\n", + "from data_utils.model_train import ft_net\n", + "from utils.util import get_stream_logger\n", + "from config.mainconfig import OUTPUT_RESULT_DIR, CONFIG_PATH\n", + "\n", + "\n", + "\n", + "def fliplr(img):\n", + " '''flip horizontal'''\n", + " inv_idx = torch.arange(img.size(3)-1,-1,-1).long() # N x C x H x W\n", + " img_flip = img.index_select(3,inv_idx)\n", + " return img_flip\n", + "\n", + "def extract_feature(model, dataloaders, flip):\n", + " features = torch.FloatTensor()\n", + " count = 0\n", + " for _, data in enumerate(tqdm(dataloaders),0):\n", + " img, _ = data\n", + " n, c, h, w = img.size()\n", + " count += n\n", + "\n", + " input_img = img.cuda()\n", + " ff = model(input_img)\n", + "\n", + " if flip:\n", + " img = fliplr(img)\n", + " input_img = img.cuda()\n", + " outputs_flip = model(input_img)\n", + " ff += outputs_flip\n", + "\n", + " fnorm = torch.norm(ff, p=2, dim=1, keepdim=True)\n", + " ff = ff.div(fnorm.expand_as(ff))\n", + " #print(ff.shape)\n", + " 
features = torch.cat((features,ff.data.cpu().float()), 0)\n", + " #features = torch.cat((features,ff.data.float()), 0)\n", + " return features\n", + "\n", + "\n", + "def get_id(img_path):\n", + " '''\n", + " xjh: \n", + " example of the name of the img: 0769_c013_00074310_0\n", + " 0769 is the vehicleID, 013 is the cameraID, 00074310 is the frameID\n", + " '''\n", + " camera_id = []\n", + " labels = []\n", + " for path, _ in img_path:\n", + " #filename = path.split('/')[-1]\n", + " filename = os.path.basename(path) #get the name of images\n", + " # Test Gallery Image\n", + " if not 'c' in filename: \n", + " labels.append(9999999)\n", + " camera_id.append(9999999)\n", + " else:\n", + " #label = filename[0:4]\n", + " label = filename[0:5] #for benchmark_person\n", + " camera = filename.split('c')[1]\n", + " if label[0:2]=='-1':\n", + " labels.append(-1)\n", + " else:\n", + " labels.append(int(label))\n", + " #camera_id.append(int(camera[0:3]))\n", + " camera_id.append(int(camera[0:2]))#for benchmark_person\n", + " #print(camera[0:3])\n", + " return camera_id, labels\n", + "\n", + "\n", + "def test(config_file_path:str, logger):\n", + " #read config files\n", + " with open(config_file_path, encoding='utf-8') as f:\n", + " opts = yaml.load(f, Loader=yaml.SafeLoader)\n", + "\n", + " data_dir = opts['input']['dataset']['data_dir']\n", + " name = \"trained_\" + opts['input']['config']['name']\n", + " trained_model_name = name + \"_last.pth\"\n", + " save_path = OUTPUT_RESULT_DIR\n", + "\n", + " nclass = opts['input']['config']['nclass']\n", + " stride = opts['input']['config']['stride']\n", + " pool = opts['input']['config']['pool']\n", + " droprate = opts['input']['config']['droprate']\n", + " inputsize= opts['input']['config']['inputsize']\n", + " w = opts['input']['config']['w']\n", + " h = opts['input']['config']['h']\n", + " batchsize = opts['input']['config']['batchsize']\n", + " flip = opts['test']['flip_test']\n", + "\n", + " trained_model_path = os.path.join(save_path, trained_model_name)\n", + "\n", + " ##############################load model#################################################\n", + " ###self-train\n", + " model = ft_net(class_num = nclass, droprate = droprate, stride=stride, init_model=None, pool = pool, return_f=False)\n", + " \n", + " try:\n", + " model.load_state_dict(torch.load(trained_model_path))\n", + " except:\n", + " model = torch.nn.DataParallel(model)\n", + " model.load_state_dict(torch.load(trained_model_path))\n", + " model = model.module\n", + " model.classifier.classifier = nn.Sequential() #model ends with feature extractor(output len is 512)\n", + " # print(model)\n", + " \n", + " ##############################load dataset###############################################\n", + " \n", + " #transforms for input image h==w==299, inputsize==256\n", + " if h == w:\n", + " data_transforms = transforms.Compose([\n", + " transforms.Resize( ( round(inputsize*1.1), round(inputsize*1.1)), interpolation=3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + " else:\n", + " data_transforms = transforms.Compose( [\n", + " transforms.Resize((round(h*1.1), round(w*1.1)), interpolation=3), #Image.BICUBIC\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + "\n", + " image_datasets = {x: datasets.ImageFolder( os.path.join(data_dir,x) ,data_transforms) for x in ['bounding_box_test','query']}\n", + " dataloaders = {x: 
torch.utils.data.DataLoader(image_datasets[x], batch_size=batchsize,\n", + " shuffle=False, num_workers=8) for x in ['bounding_box_test','query']}\n", + "\n", + " #############################check GPU###################################################\n", + " use_gpu = torch.cuda.is_available()\n", + "\n", + "\n", + " #############################extract features############################################\n", + " # Change to test mode\n", + " model = model.eval()\n", + " if use_gpu:\n", + " model = model.cuda()\n", + "\n", + " gallery_path = image_datasets['bounding_box_test'].imgs\n", + " query_path = image_datasets['query'].imgs\n", + "\n", + " gallery_cam,gallery_label = get_id(gallery_path)\n", + " query_cam,query_label = get_id(query_path)\n", + "\n", + "\n", + " gallery_label = np.asarray(gallery_label)\n", + " query_label = np.asarray(query_label)\n", + " gallery_cam = np.asarray(gallery_cam)\n", + " query_cam = np.asarray(query_cam)\n", + " print('Gallery Size: %d'%len(gallery_label))\n", + " print('Query Size: %d'%len(query_label))\n", + " # Extract feature\n", + " since = time.time()\n", + " with torch.no_grad():\n", + " gallery_feature = extract_feature(model, dataloaders['bounding_box_test'], flip)\n", + " query_feature = extract_feature(model, dataloaders['query'], flip)\n", + " process_time = time.time() - since\n", + " logger.info('total forward time: %.2f minutes'%(process_time/60))\n", + " \n", + " dist = 1-torch.mm(query_feature, torch.transpose(gallery_feature, 0, 1))\n", + "\n", + " # Save to Matlab for check\n", + " extracted_feature = {'gallery_feature': gallery_feature.numpy(), 'gallery_label':gallery_label, 'gallery_cam':gallery_cam, \\\n", + " 'query_feature': query_feature.numpy(), 'query_label':query_label, 'query_cam':query_cam}\n", + "\n", + " result_name = os.path.join(save_path, name+'_feature.mat')\n", + " scipy.io.savemat(result_name, extracted_feature) \n", + "\n", + " return_dict = {}\n", + "\n", + " return_dict['dist'] = dist.numpy()\n", + " return_dict['feature_example'] = query_feature[0].numpy()\n", + " return_dict['gallery_label'] = gallery_label\n", + " return_dict['gallery_cam'] = gallery_cam\n", + " return_dict['query_label'] = query_label\n", + " return_dict['query_cam'] = query_cam\n", + "\n", + " pickle.dump(return_dict, open(OUTPUT_RESULT_DIR+'test_result.pkl', 'wb'), protocol=4)\n", + "\n", + " return \n", + "\n", + " # eval_result = evaluator(result, logger)\n", + " # full_table = display_eval_result(dict = eval_result)\n", + " # logger.info(full_table)\n", + "\n", + "if __name__==\"__main__\":\n", + " logger = get_stream_logger('TEST')\n", + " test(CONFIG_PATH, logger)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27b171e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/wj-Untitled.ipynb b/notebooks/wj-Untitled.ipynb new file mode 100755 index 0000000..a42c026 --- /dev/null +++ b/notebooks/wj-Untitled.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "90e7b1d4", + "metadata": {}, + "outputs": [ + { + "ename": 
"ModuleNotFoundError", + "evalue": "No module named 'torch'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0myaml\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'torch'" + ] + } + ], + "source": [ + "# -*- coding: utf-8 -*-\n", + "from __future__ import print_function, division\n", + "\n", + "# import sys\n", + "# sys.path.append('/home/xujiahong/openI_benchmark/vechicle_reID_VechicleNet/')\n", + "\n", + "import time\n", + "import yaml\n", + "import pickle\n", + "import torch\n", + "import torch.nn as nn\n", + "import numpy as np\n", + "from torchvision import datasets,transforms\n", + "import os\n", + "import scipy.io\n", + "from tqdm import tqdm\n", + "from data_utils.model_train import ft_net\n", + "from utils.util import get_stream_logger\n", + "from config.mainconfig import OUTPUT_RESULT_DIR, CONFIG_PATH\n", + "\n", + "\n", + "\n", + "def fliplr(img):\n", + " '''flip horizontal'''\n", + " inv_idx = torch.arange(img.size(3)-1,-1,-1).long() # N x C x H x W\n", + " img_flip = img.index_select(3,inv_idx)\n", + " return img_flip\n", + "\n", + "def extract_feature(model, dataloaders, flip):\n", + " features = torch.FloatTensor()\n", + " count = 0\n", + " for _, data in enumerate(tqdm(dataloaders),0):\n", + " img, _ = data\n", + " n, c, h, w = img.size()\n", + " count += n\n", + "\n", + " input_img = img.cuda()\n", + " ff = model(input_img)\n", + "\n", + " if flip:\n", + " img = fliplr(img)\n", + " input_img = img.cuda()\n", + " outputs_flip = model(input_img)\n", + " ff += outputs_flip\n", + "\n", + " fnorm = torch.norm(ff, p=2, dim=1, keepdim=True)\n", + " ff = ff.div(fnorm.expand_as(ff))\n", + " #print(ff.shape)\n", + " features = torch.cat((features,ff.data.cpu().float()), 0)\n", + " #features = torch.cat((features,ff.data.float()), 0)\n", + " return features\n", + "\n", + "\n", + "def get_id(img_path):\n", + " '''\n", + " xjh: \n", + " example of the name of the img: 0769_c013_00074310_0\n", + " 0769 is the vehicleID, 013 is the cameraID, 00074310 is the frameID\n", + " '''\n", + " camera_id = []\n", + " labels = []\n", + " for path, _ in img_path:\n", + " #filename = path.split('/')[-1]\n", + " filename = os.path.basename(path) #get the name of images\n", + " # Test Gallery Image\n", + " if not 'c' in filename: \n", + " labels.append(9999999)\n", + " camera_id.append(9999999)\n", + " else:\n", + " #label = filename[0:4]\n", + " label = filename[0:5] #for benchmark_person\n", + " camera = filename.split('c')[1]\n", + " if label[0:2]=='-1':\n", + " 
labels.append(-1)\n", + " else:\n", + " labels.append(int(label))\n", + " #camera_id.append(int(camera[0:3]))\n", + " camera_id.append(int(camera[0:2]))#for benchmark_person\n", + " #print(camera[0:3])\n", + " return camera_id, labels\n", + "\n", + "\n", + "def test(config_file_path:str, logger):\n", + " #read config files\n", + " with open(config_file_path, encoding='utf-8') as f:\n", + " opts = yaml.load(f, Loader=yaml.SafeLoader)\n", + "\n", + " data_dir = opts['input']['dataset']['data_dir']\n", + " name = \"trained_\" + opts['input']['config']['name']\n", + " trained_model_name = name + \"_last.pth\"\n", + " save_path = OUTPUT_RESULT_DIR\n", + "\n", + " nclass = opts['input']['config']['nclass']\n", + " stride = opts['input']['config']['stride']\n", + " pool = opts['input']['config']['pool']\n", + " droprate = opts['input']['config']['droprate']\n", + " inputsize= opts['input']['config']['inputsize']\n", + " w = opts['input']['config']['w']\n", + " h = opts['input']['config']['h']\n", + " batchsize = opts['input']['config']['batchsize']\n", + " flip = opts['test']['flip_test']\n", + "\n", + " trained_model_path = os.path.join(save_path, trained_model_name)\n", + "\n", + " ##############################load model#################################################\n", + " ###self-train\n", + " model = ft_net(class_num = nclass, droprate = droprate, stride=stride, init_model=None, pool = pool, return_f=False)\n", + " \n", + " try:\n", + " model.load_state_dict(torch.load(trained_model_path))\n", + " except:\n", + " model = torch.nn.DataParallel(model)\n", + " model.load_state_dict(torch.load(trained_model_path))\n", + " model = model.module\n", + " model.classifier.classifier = nn.Sequential() #model ends with feature extractor(output len is 512)\n", + " # print(model)\n", + " \n", + " ##############################load dataset###############################################\n", + " \n", + " #transforms for input image h==w==299, inputsize==256\n", + " if h == w:\n", + " data_transforms = transforms.Compose([\n", + " transforms.Resize( ( round(inputsize*1.1), round(inputsize*1.1)), interpolation=3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + " else:\n", + " data_transforms = transforms.Compose( [\n", + " transforms.Resize((round(h*1.1), round(w*1.1)), interpolation=3), #Image.BICUBIC\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n", + " ])\n", + "\n", + " image_datasets = {x: datasets.ImageFolder( os.path.join(data_dir,x) ,data_transforms) for x in ['bounding_box_test','query']}\n", + " dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batchsize,\n", + " shuffle=False, num_workers=8) for x in ['bounding_box_test','query']}\n", + "\n", + " #############################check GPU###################################################\n", + " use_gpu = torch.cuda.is_available()\n", + "\n", + "\n", + " #############################extract features############################################\n", + " # Change to test mode\n", + " model = model.eval()\n", + " if use_gpu:\n", + " model = model.cuda()\n", + "\n", + " gallery_path = image_datasets['bounding_box_test'].imgs\n", + " query_path = image_datasets['query'].imgs\n", + "\n", + " gallery_cam,gallery_label = get_id(gallery_path)\n", + " query_cam,query_label = get_id(query_path)\n", + "\n", + "\n", + " gallery_label = np.asarray(gallery_label)\n", + " query_label = 
np.asarray(query_label)\n", + " gallery_cam = np.asarray(gallery_cam)\n", + " query_cam = np.asarray(query_cam)\n", + " print('Gallery Size: %d'%len(gallery_label))\n", + " print('Query Size: %d'%len(query_label))\n", + " # Extract feature\n", + " since = time.time()\n", + " with torch.no_grad():\n", + " gallery_feature = extract_feature(model, dataloaders['bounding_box_test'], flip)\n", + " query_feature = extract_feature(model, dataloaders['query'], flip)\n", + " process_time = time.time() - since\n", + " logger.info('total forward time: %.2f minutes'%(process_time/60))\n", + " \n", + " dist = 1-torch.mm(query_feature, torch.transpose(gallery_feature, 0, 1))\n", + "\n", + " # Save to Matlab for check\n", + " extracted_feature = {'gallery_feature': gallery_feature.numpy(), 'gallery_label':gallery_label, 'gallery_cam':gallery_cam, \\\n", + " 'query_feature': query_feature.numpy(), 'query_label':query_label, 'query_cam':query_cam}\n", + "\n", + " result_name = os.path.join(save_path, name+'_feature.mat')\n", + " scipy.io.savemat(result_name, extracted_feature) \n", + "\n", + " return_dict = {}\n", + "\n", + " return_dict['dist'] = dist.numpy()\n", + " return_dict['feature_example'] = query_feature[0].numpy()\n", + " return_dict['gallery_label'] = gallery_label\n", + " return_dict['gallery_cam'] = gallery_cam\n", + " return_dict['query_label'] = query_label\n", + " return_dict['query_cam'] = query_cam\n", + "\n", + " pickle.dump(return_dict, open(OUTPUT_RESULT_DIR+'test_result.pkl', 'wb'), protocol=4)\n", + "\n", + " return \n", + "\n", + " # eval_result = evaluator(result, logger)\n", + " # full_table = display_eval_result(dict = eval_result)\n", + " # logger.info(full_table)\n", + "\n", + "if __name__==\"__main__\":\n", + " logger = get_stream_logger('TEST')\n", + " test(CONFIG_PATH, logger)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27b171e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MindSpore", + "language": "python", + "name": "mindspore" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/npu/lewis/c2net_npu.py b/npu/lewis/c2net_npu.py new file mode 100755 index 0000000..9f33cce --- /dev/null +++ b/npu/lewis/c2net_npu.py @@ -0,0 +1,149 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = 
json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] + if not os.path.exists(file_path): + os.makedirs(file_path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + #unzip dataset + os.system("unzip -d %s %s" % (file_path, path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. 
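+
+### A hypothetical illustration of the JSON string carried by --multi_data_url (the real
+### dataset_url is an OBS location filled in by the platform, not a value you choose):
+### [{"dataset_name": "MNISTData.zip", "dataset_url": "obs://.../MNISTData.zip"}]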
+ +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. 
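+    # (Hypothetical illustration: with RANK_SIZE=8, the card with rank 3 saves its
+    # checkpoints under /cache/output/3/.)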
+ if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + diff --git a/npu/lewis/c2net_npu_continue.py b/npu/lewis/c2net_npu_continue.py new file mode 100755 index 0000000..d17c47d --- /dev/null +++ b/npu/lewis/c2net_npu_continue.py @@ -0,0 +1,196 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image and unzip### +def C2netMultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] + try: + mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],zipfile_path)) + #get filename and unzip the dataset + filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] + filePath = data_dir + "/" + filename + if not os.path.exists(filePath): + os.makedirs(filePath) + os.system("unzip {} -d {}".format(zipfile_path, filePath)) + + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. 
+ f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +### Copy the output model to obs ### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + C2netMultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + C2netMultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + + + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. 
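+
+### A hypothetical illustration of resuming training with this script: passing
+### --ckpt_load_name=checkpoint_lenet-1_1875 makes the code below copy the previous
+### output back from obs and load /cache/output/checkpoint_lenet-1_1875.ckpt before
+### training continues.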
+
+parser.add_argument('--multi_data_url',
+                    help='path to multi dataset',
+                    default= '/cache/data/')
+
+parser.add_argument('--ckpt_url',
+                    help='pre_train_model path in obs')
+
+parser.add_argument('--train_url',
+                    help='model folder to save/load',
+                    default= '/cache/output/')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+### continue task parameters
+parser.add_argument('--ckpt_load_name',
+                    help='model name to load',
+                    default= '')
+
+parser.add_argument('--ckpt_save_name',
+                    help='model name to save',
+                    default= 'checkpoint')
+
+
+
+if __name__ == "__main__":
+    args, unknown = parser.parse_known_args()
+    data_dir = '/cache/dataset'
+    train_dir = '/cache/output'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(train_dir):
+        os.makedirs(train_dir)
+    ### Initialize and copy data to the training image
+    DownloadFromQizhi(args.multi_data_url, data_dir)
+    ### The dataset path used here is data_dir + "/MNISTData" + "/train"
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)
+    if device_num > 1:
+        ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size)
+    if ds_train.get_dataset_size() == 0:
+        raise ValueError(
+            "Please check dataset size > 0 and batch_size <= dataset size")
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+
+    ### Load the checkpoint when continuing a previous training task
+    if args.ckpt_load_name:
+        C2netMultiObsToEnv(args.train_url, train_dir)
+        load_path = "{}/{}.ckpt".format(train_dir, args.ckpt_load_name)
+        load_param_into_net(network, load_checkpoint(load_path))
+
+    if args.device_target != "Ascend":
+        model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()})
+    else:
+        model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2")
+    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
+                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
+    # Note that this method saves the model file on each card; you need to specify the save path on each card.
+    # In this example, get_rank() is added to distinguish the different paths.
+    if device_num == 1:
+        outputDirectory = train_dir + "/"
+    if device_num > 1:
+        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name,
+                                 directory=outputDirectory,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+
diff --git a/npu/lewis/c2net_npu_multi_dataset.py b/npu/lewis/c2net_npu_multi_dataset.py
new file mode 100755
index 0000000..fd36194
--- /dev/null
+++ b/npu/lewis/c2net_npu_multi_dataset.py
@@ -0,0 +1,197 @@
+"""
+######################## multi-dataset train lenet example ########################
+This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset
+training tutorial train.py. This example cannot be used for a single dataset!
+"""
+"""
+######################## Instructions for using the training environment ########################
+1、(1)The structure of the dataset uploaded for multi-dataset training in this example
+    MNISTData.zip
+    ├── test
+    └── train
+
+    checkpoint_lenet-1_1875.zip
+    ├── checkpoint_lenet-1_1875.ckpt
+
+  (2)The dataset structure in the training image for multiple datasets in this example
+    workroot
+    ├── MNISTData
+    |   ├── test
+    |   └── train
+    └── checkpoint_lenet-1_1875
+        ├── checkpoint_lenet-1_1875.ckpt
+
+2、Multi-dataset training requires predefined functions
+(1)Copy multi-dataset from obs to training image
+function MultiObsToEnv(multi_data_url, data_dir)
+
+(2)Copy the output to obs
+function EnvToObs(train_dir, obs_train_url)
+
+(3)Download the input from Qizhi and init
+function DownloadFromQizhi(multi_data_url, data_dir)
+
+(4)Upload the output to Qizhi
+function UploadToQizhi(train_dir, obs_train_url)
+
+3、4 parameters need to be defined
+--data_url is the first dataset you selected on the Qizhi platform
+--multi_data_url is the multi-dataset you selected on the Qizhi platform
+
+--data_url, --multi_data_url, --train_url, --device_target: these 4 parameters must be defined first in a multi-dataset task,
+otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4、How the dataset is used
+Multi-datasets use multi_data_url as input, and data_dir + dataset name + file or folder name in the dataset as the
+calling path of the dataset in the training image.
+For example, the calling path of the train folder in the MNISTData dataset in this example is
+data_dir + "/MNISTData" + "/train"
+
+For details, please refer to the following sample code.
+"""
+
+import os
+import argparse
+
+import moxing as mox
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from dataset_distributed import create_dataset_parallel
+from lenet import LeNet5
+import json
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore import load_checkpoint, load_param_into_net
+from mindspore.context import ParallelMode
+from mindspore.communication.management import init, get_rank
+import time
+
+### Copy multiple datasets from obs to training image ###
+def MultiObsToEnv(multi_data_url, data_dir):
+    # --multi_data_url is json data, need to do json parsing for multi_data_url
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        path = data_dir + "/" + multi_data_json[i]["dataset_name"]
+        file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0]
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        try:
+            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
+            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path))
+            # unzip dataset
+            os.system("unzip -d %s %s" % (file_path, path))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(
+                multi_data_json[i]["dataset_url"], path) + str(e))
+    # Set a cache file to indicate that the data has been copied from obs.
+    # If this file exists during multi-card training, there is no need to copy the dataset multiple times.
+    f = open("/cache/download_input.txt", 'w')
+    f.close()
+    try:
+        if os.path.exists("/cache/download_input.txt"):
+            print("download_input succeed")
+    except Exception as e:
+        print("download_input failed")
+    return
+
+def DownloadFromQizhi(multi_data_url, data_dir):
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        MultiObsToEnv(multi_data_url,data_dir)
+        context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
+    if device_num > 1:
+        # set device_id and init for multi-card training
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
+        init()
+        # Copying obs data does not need to be executed multiple times; just let the 0th card copy the data
+        local_rank=int(os.getenv('RANK_ID'))
+        if local_rank%8==0:
+            MultiObsToEnv(multi_data_url,data_dir)
+        # If the cache file does not exist, the data copy has not finished yet,
+        # so wait for the 0th card to finish copying the data
+        while not os.path.exists("/cache/download_input.txt"):
+            time.sleep(1)
+    return
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+### --multi_data_url, --ckpt_url, --device_target: these 3 parameters must be defined first in a multi-dataset task,
+### otherwise an error will be reported.
+### There is no need to add these parameters to the running parameters of the Qizhi platform,
+### because they are predefined in the background; you only need to define them in your code.
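+### Background note (inferred from the code above, not from platform docs):
+### RANK_SIZE is the total number of NPU devices in the job and RANK_ID is the
+### global index of the current device; local_rank % 8 == 0 selects the first
+### card of each 8-card Ascend node, so one card per node downloads the data
+### while the remaining cards wait on the /cache/download_input.txt sentinel.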
+ +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The dataset path is used here:data_dir + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" + load_param_into_net(network, load_checkpoint(os.path.join(data_dir + "/checkpoint_lenet-1_1875", + "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. + if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + diff --git a/npu/lewis/c2net_npu_pretrain.py b/npu/lewis/c2net_npu_pretrain.py new file mode 100755 index 0000000..7ab8af8 --- /dev/null +++ b/npu/lewis/c2net_npu_pretrain.py @@ -0,0 +1,233 @@ +""" +######################## single-dataset train lenet example ######################## +This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training +tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! 
+ +######################## Instructions for using the training environment ######################## +The image of the debugging environment and the image of the training environment are two different images, +and the working local directories are different. In the training task, you need to pay attention to the following points. +1、(1)The structure of the dataset uploaded for single dataset training in this example + MNISTData.zip + ├── test + └── train + + +2、Single dataset training requires predefined functions +(1)Copy single dataset from obs to training image +function ObsToEnv(obs_data_url, data_dir) + +(2)Copy the output to obs +function EnvToObs(train_dir, obs_train_url) + +(3)Download the input from Qizhi And Init +function DownloadFromQizhi(obs_data_url, data_dir) + +(4)Upload the output to Qizhi +function UploadToQizhi(train_dir, obs_train_url) + +(5)Copy ckpt file from obs to training image. +function ObsUrlToEnv(obs_ckpt_url, ckpt_url) + +3、3 parameters need to be defined +--data_url is the dataset you selected on the Qizhi platform + +--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code. + +4、How the dataset is used +A single dataset uses data_url as the input, and data_dir (ie:'/cache/data') as the calling method +of the dataset in the image. +For details, please refer to the following sample code. + +5、How to load the checkpoint file +The checkpoint file is loaded by the ckpt_url parameter + +""" + +import os +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import mindspore.ops as ops +import time +import json +#from upload import UploadOutput + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy ckpt file from obs to training image### +### To operate on folders, use mox.file.copy_parallel. If copying a file. 
+### please use mox.file.copy instead; that call copies a single file
+def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
+    try:
+        mox.file.copy(obs_ckpt_url, ckpt_url)
+        print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e))
+    return
+
+### Copy multiple datasets from obs to training image ###
+def MultiObsToEnv(multi_data_url, data_dir):
+    # --multi_data_url is json data, need to do json parsing for multi_data_url
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        path = data_dir + "/" + multi_data_json[i]["dataset_name"]
+        file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0]
+        if not os.path.exists(file_path):
+            os.makedirs(file_path)
+        try:
+            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
+            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path))
+            # unzip dataset
+            os.system("unzip -d %s %s" % (file_path, path))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(
+                multi_data_json[i]["dataset_url"], path) + str(e))
+    # Set a cache file to indicate that the data has been copied from obs.
+    # If this file exists during multi-card training, there is no need to copy the dataset multiple times.
+    f = open("/cache/download_input.txt", 'w')
+    f.close()
+    try:
+        if os.path.exists("/cache/download_input.txt"):
+            print("download_input succeed")
+    except Exception as e:
+        print("download_input failed")
+    return
+
+def DownloadFromQizhi(multi_data_url, data_dir):
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        MultiObsToEnv(multi_data_url,data_dir)
+        context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
+    if device_num > 1:
+        # set device_id and init for multi-card training
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
+        init()
+        # Copying obs data does not need to be executed multiple times; just let the 0th card copy the data
+        local_rank=int(os.getenv('RANK_ID'))
+        if local_rank%8==0:
+            MultiObsToEnv(multi_data_url,data_dir)
+        # If the cache file does not exist, the data copy has not finished yet,
+        # so wait for the 0th card to finish copying the data
+        while not os.path.exists("/cache/download_input.txt"):
+            time.sleep(1)
+    return
+
+### --multi_data_url, --ckpt_url, --device_target: these 3 parameters must be defined first in a single-dataset task,
+### otherwise an error will be reported.
+### There is no need to add these parameters to the running parameters of the Qizhi platform,
+### because they are predefined in the background; you only need to define them in your code.
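+### A hypothetical launch command matching the arguments defined below (on the
+### Qizhi platform these values are normally injected automatically):
+###   python c2net_npu_pretrain.py \
+###       --multi_data_url '[{"dataset_name": "MNISTData.zip", "dataset_url": "..."}]' \
+###       --ckpt_url s3://bucket/checkpoint.ckpt \
+###       --device_target Ascend --epoch_size 5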
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + ###Copy ckpt file from obs to training image + ObsUrlToEnv(args.ckpt_url, ckpt_url) + ###Copy data from obs to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") + + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The ckpt path is used here:ckpt_url + print('-------ckpt_url is:', args.ckpt_url) + load_param_into_net(network, load_checkpoint(ckpt_url)) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. 
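+    # For example, with RANK_SIZE=8 the checkpoints end up in /cache/output/0/
+    # through /cache/output/7/, one directory per global rank, each holding
+    # that card's checkpoint_lenet-*.ckpt files.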
+ if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) \ No newline at end of file diff --git a/npu/lewis/c2net_testbigfile.py b/npu/lewis/c2net_testbigfile.py new file mode 100755 index 0000000..df94803 --- /dev/null +++ b/npu/lewis/c2net_testbigfile.py @@ -0,0 +1,114 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] + if not os.path.exists(file_path): + os.makedirs(file_path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + #unzip dataset + os.system("unzip -d %s %s" % (file_path, path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. 
+ f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. + +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + print("--------start ls:") + os.system("cd /cache/dataset; ls -al") + print("--------end ls-----------") + diff --git a/npu/lewis/config.py b/npu/lewis/config.py new file mode 100755 index 0000000..22d68e2 --- /dev/null +++ b/npu/lewis/config.py @@ -0,0 +1,33 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+"""
+network config setting, will be used in train.py
+"""
+
+from easydict import EasyDict as edict
+
+mnist_cfg = edict({
+    'num_classes': 10,
+    'lr': 0.01,
+    'momentum': 0.9,
+    'epoch_size': 10,
+    'batch_size': 32,
+    'buffer_size': 1000,
+    'image_height': 32,
+    'image_width': 32,
+    'save_checkpoint_steps': 1875,
+    'keep_checkpoint_max': 150,
+    'air_name': "lenet",
+})
diff --git a/npu/lewis/dataset.py b/npu/lewis/dataset.py
new file mode 100755
index 0000000..df9eecd
--- /dev/null
+++ b/npu/lewis/dataset.py
@@ -0,0 +1,60 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Produce the dataset
+"""
+
+import mindspore.dataset as ds
+import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.dataset.vision import Inter
+from mindspore.common import dtype as mstype
+
+
+def create_dataset(data_path, batch_size=32, repeat_size=1,
+                   num_parallel_workers=1):
+    """
+    create dataset for train or test
+    """
+    # define dataset
+    mnist_ds = ds.MnistDataset(data_path)
+
+    resize_height, resize_width = 32, 32
+    rescale = 1.0 / 255.0
+    shift = 0.0
+    rescale_nml = 1 / 0.3081
+    shift_nml = -1 * 0.1307 / 0.3081
+
+    # define map operations
+    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
+    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
+    rescale_op = CV.Rescale(rescale, shift)
+    hwc2chw_op = CV.HWC2CHW()
+    type_cast_op = C.TypeCast(mstype.int32)
+
+    # apply map operations on images
+    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+
+    # apply DatasetOps
+    buffer_size = 10000
+    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
+    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
+    mnist_ds = mnist_ds.repeat(repeat_size)
+
+    return mnist_ds
diff --git a/npu/lewis/dataset_distributed.py b/npu/lewis/dataset_distributed.py
new file mode 100755
index 0000000..d813078
--- /dev/null
+++ b/npu/lewis/dataset_distributed.py
@@ -0,0 +1,55 @@
+
+"""
+Produce the dataset:
+Unlike the single-machine version, the dataset interface here must be given the num_shards and
+shard_id arguments, which correspond to the number of cards and the logical index of the current
+card; it is recommended to obtain them through the HCCL interfaces:
+get_rank: get the ID of the current device in the cluster.
+get_group_size: get the number of devices in the cluster.
+
+"""
+
+import mindspore.dataset as ds
+import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.dataset.vision import Inter
+from mindspore.common import dtype as mstype
+from mindspore.communication.management import init, get_rank, get_group_size
+
+
+def create_dataset_parallel(data_path, batch_size=32, repeat_size=1,
+                            num_parallel_workers=1, shard_id=0, num_shards=8):
+    """
+    create dataset for train or test
+    """
+
+    resize_height, resize_width = 32, 32
+    rescale = 1.0 / 255.0
+    shift = 0.0
+    rescale_nml = 1 / 0.3081
+    shift_nml = -1 * 0.1307 / 0.3081
+    # Get shard_id and num_shards from HCCL: the ID of the current device in the cluster
+    # and the total number of devices. These overwrite the default argument values.
+    shard_id = get_rank()
+    num_shards = get_group_size()
+    # define dataset
+    mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id)
+
+    # define map operations
+    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
+    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
+    rescale_op = CV.Rescale(rescale, shift)
+    hwc2chw_op = CV.HWC2CHW()
+    type_cast_op = C.TypeCast(mstype.int32)
+
+    # apply map operations on images
+    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+
+    # apply DatasetOps
+    buffer_size = 10000
+    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
+    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
+    mnist_ds = mnist_ds.repeat(repeat_size)
+
+    return mnist_ds
diff --git a/npu/lewis/lenet.py b/npu/lewis/lenet.py
new file mode 100755
index 0000000..0600793
--- /dev/null
+++ b/npu/lewis/lenet.py
@@ -0,0 +1,60 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""LeNet."""
+import mindspore.nn as nn
+from mindspore.common.initializer import Normal
+
+
+class LeNet5(nn.Cell):
+    """
+    Lenet network
+
+    Args:
+        num_class (int): Number of classes. Default: 10.
+        num_channel (int): Number of channels. Default: 1.
+ + Returns: + Tensor, output tensor + Examples: + >>> LeNet(num_class=10) + + """ + def __init__(self, num_class=10, num_channel=1, include_top=True): + super(LeNet5, self).__init__() + self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.include_top = include_top + if self.include_top: + self.flatten = nn.Flatten() + self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) + self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) + self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) + + def construct(self, x): + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + if not self.include_top: + return x + x = self.flatten(x) + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x diff --git a/npu/pretrain.py b/npu/pretrain.py new file mode 100755 index 0000000..0d4d30d --- /dev/null +++ b/npu/pretrain.py @@ -0,0 +1,231 @@ +""" +######################## single-dataset train lenet example ######################## +This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training +tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! + +######################## Instructions for using the training environment ######################## +The image of the debugging environment and the image of the training environment are two different images, +and the working local directories are different. In the training task, you need to pay attention to the following points. +1、(1)The structure of the dataset uploaded for single dataset training in this example + MNISTData.zip + ├── test + └── train + + +2、Single dataset training requires predefined functions +(1)Copy single dataset from obs to training image +function ObsToEnv(obs_data_url, data_dir) + +(2)Copy the output to obs +function EnvToObs(train_dir, obs_train_url) + +(3)Download the input from Qizhi And Init +function DownloadFromQizhi(obs_data_url, data_dir) + +(4)Upload the output to Qizhi +function UploadToQizhi(train_dir, obs_train_url) + +(5)Copy ckpt file from obs to training image. +function ObsUrlToEnv(obs_ckpt_url, ckpt_url) + +3、3 parameters need to be defined +--data_url is the dataset you selected on the Qizhi platform + +--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code. + +4、How the dataset is used +A single dataset uses data_url as the input, and data_dir (ie:'/cache/data') as the calling method +of the dataset in the image. +For details, please refer to the following sample code. + +5、How to load the checkpoint file +The checkpoint file is loaded by the ckpt_url parameter + +In addition, if you want to get the model file after each training, you can call the UploadOutput. 
+""" + +import os +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import mindspore.ops as ops +import time +from upload import UploadOutput + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy ckpt file from obs to training image### +### To operate on folders, use mox.file.copy_parallel. If copying a file. +### Please use mox.file.copy to operate the file, this operation is to operate the file +def ObsUrlToEnv(obs_ckpt_url, ckpt_url): + try: + mox.file.copy(obs_ckpt_url, ckpt_url) + print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) + return +### Copy the output to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return +def DownloadFromQizhi(obs_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ObsToEnv(obs_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + ObsToEnv(obs_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +### 
--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= '/cache/data/') + +parser.add_argument('--train_url', + help='output folder to save/load', + default= '/cache/output/') +parser.add_argument('--ckpt_url', + help='model to save/load', + default= '/cache/checkpoint.ckpt') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/data' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + ###Copy ckpt file from obs to training image + ObsUrlToEnv(args.ckpt_url, ckpt_url) + ###Copy data from obs to training image + DownloadFromQizhi(args.data_url, data_dir) + ###The dataset path is used here:data_dir +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") + + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The ckpt path is used here:ckpt_url + load_param_into_net(network, load_checkpoint(ckpt_url)) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. 
+    if device_num == 1:
+        outputDirectory = train_dir + "/"
+    if device_num > 1:
+        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=outputDirectory,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    # Custom callback: upload the output after each epoch
+    uploadOutput = UploadOutput(train_dir,args.train_url)
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor(), uploadOutput])
+
+    ### Copy the trained output data from the local running environment back to obs,
+    ### and download it in the training task corresponding to the Qizhi platform.
+    # This step is not required if UploadOutput is called
+    UploadToQizhi(train_dir,args.train_url)
\ No newline at end of file
diff --git a/npu/pretrain_for_c2net.py b/npu/pretrain_for_c2net.py
new file mode 100755
index 0000000..3b7318c
--- /dev/null
+++ b/npu/pretrain_for_c2net.py
@@ -0,0 +1,245 @@
+"""
+######################## Attention! ########################
+On the intelligent computing network (C2Net), the code itself needs to use mox to copy the
+dataset and decompress it; please refer to the function C2netMultiObsToEnv()
+
+######################## multi-dataset train lenet example ########################
+This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset
+training tutorial train.py. This example cannot be used for a single dataset!
+"""
+"""
+######################## Instructions for using the training environment ########################
+1、(1)The structure of the dataset uploaded for multi-dataset training in this example
+    MNISTData.zip
+    ├── test
+    └── train
+
+    checkpoint_lenet-1_1875.zip
+    ├── checkpoint_lenet-1_1875.ckpt
+
+  (2)The dataset structure in the training image for multiple datasets in this example
+    workroot
+    ├── MNISTData
+    |   ├── test
+    |   └── train
+    └── checkpoint_lenet-1_1875
+        ├── checkpoint_lenet-1_1875.ckpt
+
+2、Multi-dataset training requires predefined functions
+(1)Copy multi-dataset from obs to training image and unzip
+function C2netMultiObsToEnv(multi_data_url, data_dir)
+
+(2)Copy the output to obs
+function EnvToObs(train_dir, obs_train_url)
+
+(3)Download the input from Qizhi and init
+function DownloadFromQizhi(multi_data_url, data_dir)
+
+(4)Upload the output to Qizhi
+function UploadToQizhi(train_dir, obs_train_url)
+
+3、3 parameters need to be defined
+--multi_data_url is the multi-dataset you selected on the Qizhi platform
+
+--multi_data_url, --train_url, --device_target: these 3 parameters must be defined first in a multi-dataset task,
+otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4、How the dataset is used
+Multi-datasets use multi_data_url as input, and data_dir + dataset name + file or folder name in the dataset as the
+calling path of the dataset in the training image.
+For example, the calling path of the train folder in the MNISTData dataset in this example is
+data_dir + "/MNISTData" + "/train"
+
+For details, please refer to the following sample code.
+""" + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image and unzip### +def C2netMultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] + try: + mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],zipfile_path)) + #get filename and unzip the dataset + filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] + filePath = data_dir + "/" + filename + if not os.path.exists(filePath): + os.makedirs(filePath) + os.system("unzip {} -d {}".format(zipfile_path, filePath)) + + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy ckpt file from obs to training image### +### To operate on folders, use mox.file.copy_parallel. If copying a file. 
+### Please use mox.file.copy to operate the file, this operation is to operate the file +def ObsUrlToEnv(obs_ckpt_url, ckpt_url): + try: + mox.file.copy(obs_ckpt_url, ckpt_url) + print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) + return +### Copy the output model to obs ### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + C2netMultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + C2netMultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--train_url,--device_target,These 3 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. 
+ +parser.add_argument('--multi_data_url', + help='path to multi dataset', + default= '/cache/data/') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= '/cache/output/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/data' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Copy ckpt file from obs to training image + ObsUrlToEnv(args.ckpt_url, ckpt_url) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The ckpt path is used here:ckpt_url + load_param_into_net(network, load_checkpoint(ckpt_url)) + + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. 
+    if device_num == 1:
+        outputDirectory = train_dir
+    if device_num > 1:
+        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=outputDirectory,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    # Test code: send the results back to the platform
+    os.system("cd /cache/script_for_grampus/ &&./uploader_for_npu " + "/cache/code/")
+
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+    ### Copy the trained output data from the local running environment back to obs,
+    ### and download it in the training task corresponding to the Qizhi platform.
+    UploadToQizhi(train_dir,args.train_url)
+
diff --git a/npu/testpaddle.py b/npu/testpaddle.py
new file mode 100644
index 0000000..26b0b1b
--- /dev/null
+++ b/npu/testpaddle.py
@@ -0,0 +1,2 @@
+import paddle
+print("-------test, paddle version is: " + paddle.__version__)
\ No newline at end of file
diff --git a/npu/train_continue.py b/npu/train_continue.py
new file mode 100755
index 0000000..81f2d4a
--- /dev/null
+++ b/npu/train_continue.py
@@ -0,0 +1,199 @@
+#####################################################################################################
+# Continued-training feature: when a training task is modified and "reuse the last result" is
+# checked, the previous task's results can be read from the new task's output path.
+#
+# Example usage
+# - Add two training parameters:
+#   'ckpt_save_name': the output file name of the current task
+#   'ckpt_load_name': the output file name of the previous task, used to load the previously saved
+#    model file; it defaults to empty, in which case no file is loaded
+# - The training code checks whether 'ckpt_load_name' is empty; if it is not empty, this is a
+#   continued-training task
+#####################################################################################################
+
+
+import os
+import argparse
+import moxing as mox
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from dataset_distributed import create_dataset_parallel
+from lenet import LeNet5
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore import load_checkpoint, load_param_into_net
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore.context import ParallelMode
+from mindspore.communication.management import init, get_rank
+import mindspore.ops as ops
+import time
+from upload import UploadOutput
+
+### Copy single file from obs to training image ###
+def ObsToEnv(obs_data_url, data_dir):
+    try:
+        mox.file.copy_parallel(obs_data_url, data_dir)
+        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
+    # Set a cache file to indicate that the data has been copied from obs.
+    # If this file exists during multi-card training, there is no need to copy the dataset multiple times.
+ f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +### Copy the output to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +def DownloadFromQizhi(obs_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ObsToEnv(obs_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + ObsToEnv(obs_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. 
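+### Continued-training flow (sketch, following the header comment of this file):
+### on the first run --ckpt_load_name stays empty and training starts from
+### scratch; on a rerun that reuses the previous output, pass e.g.
+### --ckpt_load_name checkpoint-1_1875 (hypothetical name) and the main block
+### below copies the previous train_url output back into /cache/output and
+### loads /cache/output/checkpoint-1_1875.ckpt before training resumes.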
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+parser.add_argument('--data_url',
+                    help='path to training/inference dataset folder',
+                    default= '/cache/data/')
+
+parser.add_argument('--train_url',
+                    help='output folder to save/load',
+                    default= '/cache/output/')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+### continue task parameters
+parser.add_argument('--ckpt_load_name',
+                    help='model name to load',
+                    default= '')
+
+parser.add_argument('--ckpt_save_name',
+                    help='model name to save',
+                    default= 'checkpoint')
+
+
+if __name__ == "__main__":
+    args, unknown = parser.parse_known_args()
+    data_dir = '/cache/data'
+    base_path = '/cache/output'
+
+    try:
+        if not os.path.exists(data_dir):
+            os.makedirs(data_dir)
+        if not os.path.exists(base_path):
+            os.makedirs(base_path)
+    except Exception as e:
+        print("path already exists")
+
+    ###Initialize and copy data to training image
+    ###Copy data from obs to training image
+    DownloadFromQizhi(args.data_url, data_dir)
+    ###The dataset path used here is data_dir + "/train"
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size)
+    if device_num > 1:
+        ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size)
+    if ds_train.get_dataset_size() == 0:
+        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
+
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+
+    ### Load the model when continuing a previous training task
+    if args.ckpt_load_name:
+        ObsToEnv(args.train_url, base_path)
+        load_path = "{}/{}.ckpt".format(base_path,args.ckpt_load_name)
+        load_param_into_net(network, load_checkpoint(load_path))
+
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()},
+                      amp_level="O2")
+
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=1)
+    #Note that this method saves the model file on each card. You need to specify the save path on each card.
+    # In this example, get_rank() is added to distinguish different paths.
+    if device_num == 1:
+        save_path = base_path + "/"
+    if device_num > 1:
+        save_path = base_path + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix=args.ckpt_save_name,
+                                 directory=save_path,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    #Custom callback: upload the output after each epoch
+    uploadOutput = UploadOutput(base_path,args.train_url)
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor(), uploadOutput])
+
+    ###Copy the trained output data from the local running environment back to obs,
+    ###and download it in the training task corresponding to the Qizhi platform
+    #This step is not required if UploadOutput is called
+    UploadToQizhi(base_path,args.train_url)
\ No newline at end of file
diff --git a/npu/train_for_c2net_testcopy.py b/npu/train_for_c2net_testcopy.py
new file mode 100644
index 0000000..52a7b62
--- /dev/null
+++ b/npu/train_for_c2net_testcopy.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+#coding=utf-8
+"""
+######################## train lenet example ########################
+train lenet and get network model files(.ckpt)
+"""
+
+import os
+import argparse
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from lenet import LeNet5
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore.common import set_seed
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend); to use an NPU on the Qizhi platform, add the run parameter device_target=Ascend on the Qizhi training page')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+set_seed(1)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print('args:')
+    print(args)
+
+    train_dir = '/cache/output'
+    data_dir = '/cache/dataset'
+
+    #Note: this is important; it selects the training device, CPU or Ascend NPU
+    context.set_context(mode=context.GRAPH_MODE,
+                        device_target=args.device_target)
+    #Create the dataset
+    ds_train = create_dataset(os.path.join(data_dir, "train"),
+                              cfg.batch_size)
+    if ds_train.get_dataset_size() == 0:
+        raise ValueError(
+            "Please check dataset size > 0 and batch_size <= dataset size")
+    #Create the network
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()},
+                      amp_level="O2")
+
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=cfg.keep_checkpoint_max)
+    #Define the model output path
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=train_dir,
+                                 config=config_ck)
+    #Start training
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    # Test code: upload the results back
+    os.system("cd /cache/script_for_grampus/ &&./uploader_for_npu " + "/cache/code/")
+
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+
+    print("============== Finish Training ==============")
\ No newline at end of file
diff --git a/npu/train_for_c2net_testcopy2.py b/npu/train_for_c2net_testcopy2.py
new file mode 100644
index 0000000..6a0ee7e
--- /dev/null
+++ b/npu/train_for_c2net_testcopy2.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+#coding=utf-8
+"""
+######################## train lenet example ########################
+train lenet and get network model files(.ckpt)
+"""
+
+import os
+import argparse
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from lenet import LeNet5
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore.common import set_seed
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend); to use an NPU on the Qizhi platform, add the run parameter device_target=Ascend on the Qizhi training page')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+set_seed(1)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print('args:')
+    print(args)
+
+    train_dir = '/cache/output'
+    data_dir = '/cache/dataset'
+
+    #Note: this is important; it selects the training device, CPU or Ascend NPU
+    context.set_context(mode=context.GRAPH_MODE,
+                        device_target=args.device_target)
+    #Create the dataset
+    ds_train = create_dataset(os.path.join(data_dir, "train"),
+                              cfg.batch_size)
+    if ds_train.get_dataset_size() == 0:
+        raise ValueError(
+            "Please check dataset size > 0 and batch_size <= dataset size")
+    #Create the network
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()},
+                      amp_level="O2")
+
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=cfg.keep_checkpoint_max)
+    #Define the model output path
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=train_dir,
+                                 config=config_ck)
+    #Start training
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+    # Test code: upload the results back
+    os.system("cd /cache/script_for_grampus/ &&./uploader_for_npu " + "/cache/code")
+
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+
+    print("============== Finish Training ==============")
\ No newline at end of file
diff --git a/npu/upload.py b/npu/upload.py
new file mode 100644
index 0000000..6060ab2
--- /dev/null
+++ b/npu/upload.py
@@ -0,0 +1,14 @@
+from mindspore.train.callback import Callback
+import moxing as mox
+
+class UploadOutput(Callback):
+    def __init__(self, train_dir, obs_train_url):
+        self.train_dir = train_dir
+        self.obs_train_url = obs_train_url
+    def epoch_end(self, run_context):
+        try:
+            mox.file.copy_parallel(self.train_dir, self.obs_train_url)
+            print("Successfully Upload {} to {}".format(self.train_dir, self.obs_train_url))
+        except Exception as e:
+            print('moxing upload {} to {} failed: '.format(self.train_dir, self.obs_train_url) + str(e))
+        return
diff --git a/npu/upload_for_c2net.py b/npu/upload_for_c2net.py
new file mode 100644
index 0000000..b725724
--- /dev/null
+++ b/npu/upload_for_c2net.py
@@ -0,0 +1,6 @@
+from mindspore.train.callback import Callback
+import os
+
+class UploadOutput(Callback):
+    def epoch_end(self, run_context):
+        os.system("cd /cache/script_for_grampus/ &&./uploader_for_npu " + "/cache/output/")
diff --git a/npu_multiNode/README.md b/npu_multiNode/README.md
new file mode 100755
index 0000000..298794f
--- /dev/null
+++ b/npu_multiNode/README.md
@@ -0,0 +1,2 @@
+# MNIST_PytorchExample_npu_multiNode
+
diff --git a/npu_multiNode/config.py b/npu_multiNode/config.py
new file mode 100755
index 0000000..22d68e2
--- /dev/null
+++ b/npu_multiNode/config.py
@@ -0,0 +1,33 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+network config setting, will be used in train.py
+"""
+
+from easydict import EasyDict as edict
+
+mnist_cfg = edict({
+    'num_classes': 10,
+    'lr': 0.01,
+    'momentum': 0.9,
+    'epoch_size': 10,
+    'batch_size': 32,
+    'buffer_size': 1000,
+    'image_height': 32,
+    'image_width': 32,
+    'save_checkpoint_steps': 1875,
+    'keep_checkpoint_max': 150,
+    'air_name': "lenet",
+})
diff --git a/npu_multiNode/dataset.py b/npu_multiNode/dataset.py
new file mode 100755
index 0000000..df9eecd
--- /dev/null
+++ b/npu_multiNode/dataset.py
@@ -0,0 +1,60 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Produce the dataset
+"""
+
+import mindspore.dataset as ds
+import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.dataset.vision import Inter
+from mindspore.common import dtype as mstype
+
+
+def create_dataset(data_path, batch_size=32, repeat_size=1,
+                   num_parallel_workers=1):
+    """
+    create dataset for train or test
+    """
+    # define dataset
+    mnist_ds = ds.MnistDataset(data_path)
+
+    resize_height, resize_width = 32, 32
+    rescale = 1.0 / 255.0
+    shift = 0.0
+    rescale_nml = 1 / 0.3081
+    shift_nml = -1 * 0.1307 / 0.3081
+
+    # define map operations
+    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
+    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
+    rescale_op = CV.Rescale(rescale, shift)
+    hwc2chw_op = CV.HWC2CHW()
+    type_cast_op = C.TypeCast(mstype.int32)
+
+    # apply map operations on images
+    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+
+    # apply DatasetOps
+    buffer_size = 10000
+    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
+    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
+    mnist_ds = mnist_ds.repeat(repeat_size)
+
+    return mnist_ds
diff --git a/npu_multiNode/dataset_distributed.py b/npu_multiNode/dataset_distributed.py
new file mode 100755
index 0000000..66cca60
--- /dev/null
+++ b/npu_multiNode/dataset_distributed.py
@@ -0,0 +1,54 @@
+
+"""
+Produce the dataset:
+Unlike the single-machine case, the dataset interface must be given the num_shards and
+shard_id parameters, which correspond to the number of cards and the card's logical ordinal.
+It is recommended to obtain them through the HCCL interfaces:
+get_rank: obtains the ID of the current device in the cluster.
+get_group_size: obtains the number of devices in the cluster.
+
+"""
+
+import mindspore.dataset as ds
+import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.dataset.vision import Inter
+from mindspore.common import dtype as mstype
+from mindspore.communication.management import get_rank, get_group_size
+
+def create_dataset_parallel(data_path, batch_size=32, repeat_size=1,
+                            num_parallel_workers=1, shard_id=0, num_shards=8):
+    """
+    create dataset for train or test
+    """
+
+    resize_height, resize_width = 32, 32
+    rescale = 1.0 / 255.0
+    shift = 0.0
+    rescale_nml = 1 / 0.3081
+    shift_nml = -1 * 0.1307 / 0.3081
+    # Get shard_id and num_shards: the ID of the current device in the cluster and the cluster size.
+ shard_id = get_rank() + num_shards = get_group_size() + # define dataset + mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/npu_multiNode/lenet.py b/npu_multiNode/lenet.py new file mode 100755 index 0000000..0600793 --- /dev/null +++ b/npu_multiNode/lenet.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LeNet.""" +import mindspore.nn as nn +from mindspore.common.initializer import Normal + + +class LeNet5(nn.Cell): + """ + Lenet network + + Args: + num_class (int): Number of classes. Default: 10. + num_channel (int): Number of channels. Default: 1. 
+
+    Returns:
+        Tensor, output tensor
+    Examples:
+        >>> LeNet(num_class=10)
+
+    """
+    def __init__(self, num_class=10, num_channel=1, include_top=True):
+        super(LeNet5, self).__init__()
+        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
+        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
+        self.relu = nn.ReLU()
+        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.include_top = include_top
+        if self.include_top:
+            self.flatten = nn.Flatten()
+            self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
+            self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
+            self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))
+
+    def construct(self, x):
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.max_pool2d(x)
+        x = self.conv2(x)
+        x = self.relu(x)
+        x = self.max_pool2d(x)
+        if not self.include_top:
+            return x
+        x = self.flatten(x)
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
diff --git a/npu_multiNode/train.py b/npu_multiNode/train.py
new file mode 100755
index 0000000..7471a3f
--- /dev/null
+++ b/npu_multiNode/train.py
@@ -0,0 +1,211 @@
+"""
+######################## single-dataset train lenet example ########################
+This example is a single-dataset training tutorial. For multiple datasets, please refer to the
+multi-dataset training tutorial train_for_multidataset.py. This example cannot be used for
+multi-dataset tasks!
+
+######################## Instructions for using the training environment ########################
+The image of the debugging environment and the image of the training environment are two
+different images, and their local working directories differ. In a training task, pay attention
+to the following points.
+1. (1) The structure of the dataset uploaded for single-dataset training in this example
+    MNISTData.zip
+    ├── test
+    └── train
+
+
+2. Single-dataset training requires predefined functions
+(1) Copy the single dataset from obs to the training image
+function ObsToEnv(obs_data_url, data_dir)
+
+(2) Copy the output to obs
+function EnvToObs(train_dir, obs_train_url)
+
+(3) Download the input from Qizhi and init
+function DownloadFromQizhi(obs_data_url, data_dir)
+
+(4) Upload the output to Qizhi
+function UploadToQizhi(train_dir, obs_train_url)
+
+3. Three parameters need to be defined
+--data_url is the dataset you selected on the Qizhi platform
+
+--data_url, --train_url, --device_target: these 3 parameters must be defined first in a
+single-dataset task, otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4. How the dataset is used
+A single dataset uses data_url as the input and data_dir (i.e. '/cache/data') as the path
+for accessing the dataset inside the image.
+For details, please refer to the following sample code.
+ +""" + +import os +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank, get_group_size +import mindspore.ops as ops +import time + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy the output to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return +def DownloadFromQizhi(obs_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + node_num = get_group_size() + if device_num == 1: + ObsToEnv(obs_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1 and node_num == 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + ObsToEnv(obs_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + if node_num > 1: + ObsToEnv(obs_data_url,data_dir) + return +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. 
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+parser.add_argument('--data_url',
+                    help='path to training/inference dataset folder',
+                    default= '/cache/data/')
+
+parser.add_argument('--train_url',
+                    help='output folder to save/load',
+                    default= '/cache/output/')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend); to use the CPU on the Qizhi platform, set device_target=CPU')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+#argparse does not convert 'True'/'False' strings with type=bool (any non-empty string is
+#truthy), so parse the flag explicitly.
+parser.add_argument('--distributed',
+                    type=lambda x: str(x).lower() in ('true', '1'),
+                    default=True,
+                    help='Whether to perform distributed training.')
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    data_dir = '/cache/data'
+    train_dir = '/cache/output'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(train_dir):
+        os.makedirs(train_dir)
+    ###Initialize and copy data to training image
+    if args.distributed:
+        init()
+    DownloadFromQizhi(args.data_url, data_dir)
+    ###The dataset path used here is data_dir + "/train"
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size)
+    if device_num > 1:
+        ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size)
+    if ds_train.get_dataset_size() == 0:
+        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
+
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()},
+                      amp_level="O2")
+
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=cfg.keep_checkpoint_max)
+    #Note that this method saves the model file on each card. You need to specify the save path on each card.
+    # In this example, get_rank() is added to distinguish different paths.
+    if device_num == 1:
+        outputDirectory = train_dir + "/"
+    if device_num > 1:
+        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=outputDirectory,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if (args.epoch_size):
+        epoch_size = args.epoch_size
+    print('epoch_size is: ', epoch_size)
+
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+
+    ###Copy the trained output data from the local running environment back to obs,
+    ###and download it in the training task corresponding to the Qizhi platform
+    UploadToQizhi(train_dir,args.train_url)
diff --git a/npu_multiNode/说明.md b/npu_multiNode/说明.md
new file mode 100755
index 0000000..8aa9aac
--- /dev/null
+++ b/npu_multiNode/说明.md
@@ -0,0 +1,5 @@
+
+
+This must be a multi-machine, multi-node task, for example 2 nodes with 2 cards each.
+
+Selecting a single dataset is enough: mnistData.
\ No newline at end of file
diff --git a/npu_new/README.md b/npu_new/README.md
new file mode 100755
index 0000000..2080817
--- /dev/null
+++ b/npu_new/README.md
@@ -0,0 +1,99 @@
+
+# How to train a model on the Qizhi platform - NPU version
+- **Single-dataset and multi-dataset training on the Qizhi cluster and on the intelligent computing network (c2net) cluster are used differently; choose the one training mode you need and note the differences (the environments below are training environments by default)**:
+    - For single-dataset, single-card or multi-card training on the Qizhi cluster, see the code comments of [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py)
+    - For single-dataset, single-card inference on the Qizhi cluster, see the code comments of [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py)
+    - For multi-dataset, single-card or multi-card training on the Qizhi cluster, see the code comments of [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_multidataset.py)
+    - For single-dataset, single-card or multi-card training on the intelligent computing network cluster, see the code comments of [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_c2net.py)
+    - For more tutorials on distributed training, see the MindSpore website: [MindSpore distributed training tutorial](https://www.mindspore.cn/tutorial/training/zh-CN/r1.2/advanced_use/distributed_training_ascend.html)
+- **Differences between single-dataset and multi-dataset tasks on the NPU Qizhi cluster**:
+    - Different hyperparameters:
+      a single dataset is passed through --data_url;
+      multiple datasets are passed through --multi_data_url, and --data_url must still be kept
+    - Different dataset access paths:
+      with a single dataset, MNISTData.zip in this example is available under /cache/data;
+      with multiple datasets, MNISTData.zip is available under /cache/data/MNISTData/
+- **Differences between the NPU Qizhi cluster and the intelligent computing network cluster**:
+    - The Qizhi cluster needs moxing to copy data to and from obs
+    - The intelligent computing network cluster does not need moxing to copy data to obs
+- **Differences between the debugging image and the training image on the NPU Qizhi cluster**:
+    - For multi-card parallel training in the debugging environment, see the example [debugging-environment multi-card parallel example](https://git.openi.org.cn/OpenIOSSG/MNIST_Example_NPU_Debug)
+
+## 1 Overview
+- Using LeNet-MNIST as an example, this project briefly introduces how to complete a training task with MindSpore on the Qizhi AI collaboration platform, and provides code examples for single-dataset training, multi-dataset training, training on the intelligent computing network, and single-dataset inference, aiming to give AI developers Qizhi NPU training examples. For any problem with the example code, feel free to open an issue in this project.
+- Users can directly use the dataset and code files provided by this project to create their own training tasks.
+- The Qizhi platform integrates ModelArts and OBS, bringing datasets, code, and training resource pools together on the Qizhi AI collaboration platform for developers.
+    - ModelArts is Huawei Cloud's one-stop AI development platform for developers; it integrates Ascend AI processor resource pools, and users can try MindSpore on ModelArts.
+    - OBS is the storage service provided by Huawei Cloud.
+
+## 2 Preparation
+- To use the Qizhi platform, this project requires users to create a Qizhi platform account, clone the code into their own account, and upload the dataset. For concrete instructions, see the beginner training camp course series of the [OpenI_Learning](https://git.openi.org.cn/zeizei/OpenI_Learning) project.
+
+### 2.1 Data preparation
+#### Dataset download
+- The dataset can be downloaded from this project's dataset page: [dataset download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/datasets?type=1)
+- Data file description
+    - The MNISTData dataset consists of 10 classes of 28*28 grayscale images; the training set contains 60000 images and the test set contains 10000 images.
+    - The directory structure of the dataset archive is as follows:
+        > MNIST_Data.zip
+        > ├── test
+        > │   ├── t10k-images-idx3-ubyte
+        > │   └── t10k-labels-idx1-ubyte
+        > └── train
+        >     ├── train-images-idx3-ubyte
+        >     └── train-labels-idx1-ubyte
+
+        > checkpoint_lenet-1_1875.zip
+        > ├── checkpoint_lenet-1_1875.ckpt
+
+#### Dataset upload
+- Because this example is developed with MindSpore and must run on NPU chips, the dataset needs to be uploaded to the NPU page.\
+[Note: if you only want to trial-run this example, there is no need to upload the dataset again, because the MNIST_Example dataset used here is already a public dataset and can be referenced directly or used after starring/collecting it]
+- As shown below:
+- ![avatar](Example_Picture/数据集上传位置.png)
+### 2.2 Script preparation
+#### Example code
+- The example code can be downloaded from this repository: [code download](https://git.openi.org.cn/OpenIOSSG/MNIST_Example)
+- Code file description
+    - [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py): the script for single-dataset training on the Qizhi cluster; it copies the dataset from obs into the training image, sets the number of epochs, copies the trained model data back to obs, and so on. For details, see the code comments of [train.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train.py)
+
+    - [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_c2net.py): the script for training on the intelligent computing network; it sets the number of epochs and so on. For details, see the code comments of [train_for_c2net.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_c2net.py)
+
+    - [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_multidataset.py): the script for multi-dataset training on the Qizhi cluster; it copies multiple datasets from obs into the training image, sets the number of epochs, copies the trained model data back to obs, and so on. For details, see the code comments of [train_for_multidataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/train_for_multidataset.py)
+
+    - [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py): the script for inference on the Qizhi cluster; it copies the dataset from obs into the training image, loads the model, copies the results back to obs, and so on. For details, see the code comments of [inference.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/inference.py)
+
+    - [config.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/config.py): the network configuration, used by the single-dataset, multi-dataset, and intelligent-computing-network training scripts.
+
+    - [dataset.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/dataset.py): preprocesses the raw dataset into a dataset usable for network training, used by the single-dataset, multi-dataset, and intelligent-computing-network training scripts.
+
+    - [lenet.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/lenet.py): the training network, used by the single-dataset, multi-dataset, and intelligent-computing-network training scripts.
+    - [dataset_distributes.py](https://git.openi.org.cn/OpenIOSSG/MNIST_Example/src/branch/master/dataset_distributes.py): preprocesses the raw dataset into a dataset usable for single-machine multi-card training.
+
+## 3 Creating a training task
+- Once the data and scripts are ready, create a training task to actually run the MindSpore script. First-time users can refer to this example.
+
+### Create a training job with MindSpore as the training framework; the interface looks like the screenshot below.
+![avatar](Example_Picture/新建训练任务页面.png)
+
+
+Table 1: Parameters of the "create training job" page
+
+| Parameter | Description |
+| ----------------- | ----------- |
+| Code branch | The branch of the repository code to use; by default, the master branch can be selected. |
+| AI engine | Choose [Ascend-Powered-Engine] and the required MindSpore version (the screenshot shows [Mindspore-1.3.0-python3.7-aarch64]; make sure to use the script matching the selected version). |
+| Boot file | The startup script in the code directory. |
+| Dataset | A dataset already uploaded to the Qizhi platform. |
+| Run parameters | The single-dataset storage location and the training output location correspond to the run parameters data_url and train_url; note that multi-dataset tasks need the additional parameter multi_data_url, declared in the code. Adding run parameters passes values to other script parameters, such as epoch_size. Only such extra parameters need to be filled in here; data_url and train_url are added as run parameters by default, so there is no need to specify them again, only to define them in the code. |
+| Resource pool | Choose [Ascend: 1 * Ascend 910 CPU: 24 cores 256GiB] for single-machine, single-card. |
+
+
+## 4 Viewing the results
+### 4.1 The run log can be viewed on the training job page
+![avatar](Example_Picture/查看日志页面.png)
+### 4.2 The model files can be downloaded after training finishes
+![avatar](Example_Picture/模型下载页面.png)
+
+## For any problem with the example code, feel free to open an issue in this project.
\ No newline at end of file
diff --git a/npu_new/config.py b/npu_new/config.py
new file mode 100755
index 0000000..22d68e2
--- /dev/null
+++ b/npu_new/config.py
@@ -0,0 +1,33 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+network config setting, will be used in train.py
+"""
+
+from easydict import EasyDict as edict
+
+mnist_cfg = edict({
+    'num_classes': 10,
+    'lr': 0.01,
+    'momentum': 0.9,
+    'epoch_size': 10,
+    'batch_size': 32,
+    'buffer_size': 1000,
+    'image_height': 32,
+    'image_width': 32,
+    'save_checkpoint_steps': 1875,
+    'keep_checkpoint_max': 150,
+    'air_name': "lenet",
+})
diff --git a/npu_new/dataset.py b/npu_new/dataset.py
new file mode 100755
index 0000000..df9eecd
--- /dev/null
+++ b/npu_new/dataset.py
@@ -0,0 +1,60 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Produce the dataset
+"""
+
+import mindspore.dataset as ds
+import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.dataset.vision import Inter
+from mindspore.common import dtype as mstype
+
+
+def create_dataset(data_path, batch_size=32, repeat_size=1,
+                   num_parallel_workers=1):
+    """
+    create dataset for train or test
+    """
+    # define dataset
+    mnist_ds = ds.MnistDataset(data_path)
+
+    resize_height, resize_width = 32, 32
+    rescale = 1.0 / 255.0
+    shift = 0.0
+    rescale_nml = 1 / 0.3081
+    shift_nml = -1 * 0.1307 / 0.3081
+
+    # define map operations
+    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
+    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
+    rescale_op = CV.Rescale(rescale, shift)
+    hwc2chw_op = CV.HWC2CHW()
+    type_cast_op = C.TypeCast(mstype.int32)
+
+    # apply map operations on images
+    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+
+    # apply DatasetOps
+    buffer_size = 10000
+    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
+    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
+    mnist_ds = mnist_ds.repeat(repeat_size)
+
+    return mnist_ds
diff --git a/npu_new/dataset_distributed.py b/npu_new/dataset_distributed.py
new file mode 100755
index 0000000..66cca60
--- /dev/null
+++ b/npu_new/dataset_distributed.py
@@ -0,0 +1,54 @@
+
+"""
+Produce the dataset:
+Unlike the single-machine case, the dataset interface must be given the num_shards and
+shard_id parameters, which correspond to the number of cards and the card's logical ordinal.
+It is recommended to obtain them through the HCCL interfaces:
+get_rank: obtains the ID of the current device in the cluster.
+get_group_size: obtains the number of devices in the cluster.
+
+"""
+
+import mindspore.dataset as ds
+import mindspore.dataset.vision.c_transforms as CV
+import mindspore.dataset.transforms.c_transforms as C
+from mindspore.dataset.vision import Inter
+from mindspore.common import dtype as mstype
+from mindspore.communication.management import get_rank, get_group_size
+
+def create_dataset_parallel(data_path, batch_size=32, repeat_size=1,
+                            num_parallel_workers=1, shard_id=0, num_shards=8):
+    """
+    create dataset for train or test
+    """
+
+    resize_height, resize_width = 32, 32
+    rescale = 1.0 / 255.0
+    shift = 0.0
+    rescale_nml = 1 / 0.3081
+    shift_nml = -1 * 0.1307 / 0.3081
+    # Get shard_id and num_shards: the ID of the current device in the cluster and the cluster size.
+    shard_id = get_rank()
+    num_shards = get_group_size()
+    # define dataset
+    mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id)
+
+    # define map operations
+    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
+    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
+    rescale_op = CV.Rescale(rescale, shift)
+    hwc2chw_op = CV.HWC2CHW()
+    type_cast_op = C.TypeCast(mstype.int32)
+
+    # apply map operations on images
+    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)
+
+    # apply DatasetOps
+    buffer_size = 10000
+    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
+    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
+    mnist_ds = mnist_ds.repeat(repeat_size)
+
+    return mnist_ds
diff --git a/npu_new/inference.py b/npu_new/inference.py
new file mode 100755
index 0000000..04fde28
--- /dev/null
+++ b/npu_new/inference.py
@@ -0,0 +1,139 @@
+"""
+######################## single-dataset inference lenet example ########################
+This example is a single-dataset inference tutorial.
+
+######################## Instructions for using the inference environment ########################
+1. An inference task requires predefined functions
+(1) Copy the single dataset from obs to the inference image.
+function ObsToEnv(obs_data_url, data_dir)
+
+(2) Copy the ckpt file from obs to the inference image.
+function ObsUrlToEnv(obs_ckpt_url, ckpt_url)
+
+(3) Copy the output result to obs.
+function EnvToObs(train_dir, obs_train_url)
+
+3. Four parameters need to be defined.
+--data_url is the dataset you selected on the Qizhi platform
+--ckpt_url is the weight file you chose on the Qizhi platform
+
+--data_url, --ckpt_url, --result_url, --device_target: these 4 parameters must be defined
+first in a single-dataset inference task, otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4. How the dataset is used
+An inference task uses data_url as the input and data_dir (i.e. '/cache/data') as the path
+for accessing the dataset inside the image.
+For details, please refer to the following sample code.
+""" + +import os +import argparse +import moxing as mox +import mindspore.nn as nn +from mindspore import context +from mindspore.train.serialization import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import Tensor +import numpy as np +from glob import glob +from dataset import create_dataset +from config import mnist_cfg as cfg +from lenet import LeNet5 + +### Copy single dataset from obs to inference image ### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + return +### Copy ckpt file from obs to inference image### +### To operate on folders, use mox.file.copy_parallel. If copying a file. +### Please use mox.file.copy to operate the file, this operation is to operate the file +def ObsUrlToEnv(obs_ckpt_url, ckpt_url): + try: + mox.file.copy(obs_ckpt_url, ckpt_url) + print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) + return +### Copy the output result to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return + +### --data_url,--ckpt_url,--result_url,--device_target,These 4 parameters must be defined first in a inference task, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. 
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+parser.add_argument('--data_url',
+                    type=str,
+                    default= '/cache/data/',
+                    help='path where the dataset is saved')
+parser.add_argument('--ckpt_url',
+                    help='model to save/load',
+                    default= '/cache/checkpoint.ckpt')
+parser.add_argument('--result_url',
+                    help='result folder to save/load',
+                    default= '/cache/result/')
+parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
+                    help='device where the code will be implemented (default: Ascend)')
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    ###Initialize the data and result directories in the inference image###
+    data_dir = '/cache/data'
+    result_dir = '/cache/result'
+    ckpt_url = '/cache/checkpoint.ckpt'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(result_dir):
+        os.makedirs(result_dir)
+
+    ###Copy dataset from obs to inference image
+    ObsToEnv(args.data_url, data_dir)
+
+    ###Copy ckpt file from obs to inference image
+    ObsUrlToEnv(args.ckpt_url, ckpt_url)
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    repeat_size = cfg.epoch_size
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
+
+    print("============== Starting Testing ==============")
+
+    param_dict = load_checkpoint(os.path.join(ckpt_url))
+    load_param_into_net(network, param_dict)
+    ds_test = create_dataset(os.path.join(data_dir, "test"), batch_size=1).create_dict_iterator()
+    data = next(ds_test)
+    images = data["image"].asnumpy()
+    labels = data["label"].asnumpy()
+    print('Tensor:', Tensor(data['image']))
+    output = model.predict(Tensor(data['image']))
+    predicted = np.argmax(output.asnumpy(), axis=1)
+    print('predicted:', predicted)
+
+    print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
+    filename = 'result.txt'
+    file_path = os.path.join(result_dir, filename)
+    with open(file_path, 'a+') as file:
+        file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))
+
+    ###Copy result data from the local running environment back to obs,
+    ###and download it in the inference task corresponding to the Qizhi platform
+    EnvToObs(result_dir, args.result_url)
\ No newline at end of file
diff --git a/npu_new/inference_for_multidataset.py b/npu_new/inference_for_multidataset.py
new file mode 100755
index 0000000..cf7ae1e
--- /dev/null
+++ b/npu_new/inference_for_multidataset.py
@@ -0,0 +1,158 @@
+"""
+######################## multi-dataset inference lenet example ########################
+This example is a multi-dataset inference tutorial.
+
+######################## Instructions for using the inference environment ########################
+1. An inference task requires predefined functions
+(1) Copy multiple datasets from obs to the inference image.
+function MultiObsToEnv(obs_data_url, data_dir)
+
+(2) Copy the ckpt file from obs to the inference image.
+function ObsUrlToEnv(obs_ckpt_url, ckpt_url)
+
+(3) Copy the output result to obs.
+function EnvToObs(train_dir, obs_train_url)
+
+3. Five parameters need to be defined.
+--data_url is the first dataset you selected on the Qizhi platform
+--multi_data_url is the set of datasets you selected on the Qizhi platform
+--ckpt_url is the weight file you chose on the Qizhi platform
+--result_url is the output folder
+
+--data_url, --multi_data_url, --ckpt_url, --result_url, --device_target: these 5 parameters
+must be defined first in a multi-dataset inference task, otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4. How the dataset is used
+Multi-dataset tasks use multi_data_url as the input; data_dir + dataset name + the file or
+folder name inside the dataset is the access path of the dataset in the inference image.
+For example, the access path of the test folder of the MNIST_Data dataset in this example is
+data_dir + "/MNIST_Data" + "/test"
+
+For details, please refer to the following sample code.
+"""
+
+import os
+import argparse
+import moxing as mox
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.train import Model
+from mindspore.nn.metrics import Accuracy
+from mindspore import Tensor
+import numpy as np
+from glob import glob
+from dataset import create_dataset
+from config import mnist_cfg as cfg
+from lenet import LeNet5
+import json
+
+### Copy multiple datasets from obs to inference image ###
+def MultiObsToEnv(multi_data_url, data_dir):
+    #--multi_data_url is json data, so parse it first
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        path = data_dir + "/" + multi_data_json[i]["dataset_name"]
+        if not os.path.exists(path):
+            os.makedirs(path)
+        try:
+            mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path)
+            print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(
+                multi_data_json[i]["dataset_url"], path) + str(e))
+    return
+### Copy ckpt file from obs to inference image###
+### Use mox.file.copy_parallel to copy folders; to copy a single file,
+### use mox.file.copy, which operates on one file
+def ObsUrlToEnv(obs_ckpt_url, ckpt_url):
+    try:
+        mox.file.copy(obs_ckpt_url, ckpt_url)
+        print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url))
+    except Exception as e:
+        print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e))
+    return
+### Copy the output result to obs###
+def EnvToObs(train_dir, obs_train_url):
+    try:
+        mox.file.copy_parallel(train_dir, obs_train_url)
+        print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
+    return
+
+
+
+### --data_url,--multi_data_url,--ckpt_url,--result_url,--device_target: these 5 parameters
+### must be defined first in a multi-dataset inference task, otherwise an error will be reported.
+### There is no need to add these parameters to the running parameters of the Qizhi platform,
+### because they are predefined in the background; you only need to define them in your code.
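[Editor's sketch, not part of the original commit] The JSON shape that MultiObsToEnv above expects from --multi_data_url, plus a small helper deriving the local paths; the helper name, dataset names, and urls are illustrative only, and the snippet relies on the `json` import already present in this file:

    # json.loads(multi_data_url) has the shape:
    #   [{"dataset_name": "MNISTData", "dataset_url": "<obs url>"}, ...]
    def _local_dataset_paths(multi_data_url, data_dir):
        """Return the local directory of each dataset after MultiObsToEnv has run."""
        return [data_dir + "/" + item["dataset_name"] for item in json.loads(multi_data_url)]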
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+parser.add_argument('--data_url',
+                    type=str,
+                    default= '/cache/data1/',
+                    help='path where the dataset is saved')
+parser.add_argument('--multi_data_url',
+                    type=str,
+                    default= '/cache/data/',
+                    help='path where the dataset is saved')
+parser.add_argument('--ckpt_url',
+                    help='model to save/load',
+                    default= '/cache/checkpoint.ckpt')
+parser.add_argument('--result_url',
+                    help='result folder to save/load',
+                    default= '/cache/result/')
+parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
+                    help='device where the code will be implemented (default: Ascend)')
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    ###Initialize the data and result directories in the inference image###
+    data_dir = '/cache/data'
+    result_dir = '/cache/result'
+    ckpt_url = '/cache/checkpoint.ckpt'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(result_dir):
+        os.makedirs(result_dir)
+
+    ###Copy multiple datasets from obs to inference image
+    MultiObsToEnv(args.multi_data_url, data_dir)
+
+    ###Copy ckpt file from obs to inference image
+    ObsUrlToEnv(args.ckpt_url, ckpt_url)
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    repeat_size = cfg.epoch_size
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
+
+    print("============== Starting Testing ==============")
+
+    param_dict = load_checkpoint(os.path.join(ckpt_url))
+    load_param_into_net(network, param_dict)
+    ds_test = create_dataset(os.path.join(data_dir + "/MNISTData", "test"), batch_size=1).create_dict_iterator()
+    data = next(ds_test)
+    images = data["image"].asnumpy()
+    labels = data["label"].asnumpy()
+    print('Tensor:', Tensor(data['image']))
+    output = model.predict(Tensor(data['image']))
+    predicted = np.argmax(output.asnumpy(), axis=1)
+    print('predicted:', predicted)
+
+    print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
+    filename = 'result.txt'
+    file_path = os.path.join(result_dir, filename)
+    with open(file_path, 'a+') as file:
+        file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))
+
+    ###Copy result data from the local running environment back to obs,
+    ###and download it in the inference task corresponding to the Qizhi platform
+    EnvToObs(result_dir, args.result_url)
\ No newline at end of file
diff --git a/npu_new/lenet.py b/npu_new/lenet.py
new file mode 100755
index 0000000..0600793
--- /dev/null
+++ b/npu_new/lenet.py
@@ -0,0 +1,60 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""LeNet."""
+import mindspore.nn as nn
+from mindspore.common.initializer import Normal
+
+
+class LeNet5(nn.Cell):
+    """
+    Lenet network
+
+    Args:
+        num_class (int): Number of classes. Default: 10.
+        num_channel (int): Number of channels. Default: 1.
+
+    Returns:
+        Tensor, output tensor
+    Examples:
+        >>> LeNet(num_class=10)
+
+    """
+    def __init__(self, num_class=10, num_channel=1, include_top=True):
+        super(LeNet5, self).__init__()
+        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
+        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
+        self.relu = nn.ReLU()
+        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.include_top = include_top
+        if self.include_top:
+            self.flatten = nn.Flatten()
+            self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
+            self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
+            self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))
+
+    def construct(self, x):
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.max_pool2d(x)
+        x = self.conv2(x)
+        x = self.relu(x)
+        x = self.max_pool2d(x)
+        if not self.include_top:
+            return x
+        x = self.flatten(x)
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
diff --git a/npu_new/train.py b/npu_new/train.py
new file mode 100755
index 0000000..383b1e0
--- /dev/null
+++ b/npu_new/train.py
@@ -0,0 +1,201 @@
+"""
+######################## single-dataset train lenet example ########################
+This example is a single-dataset training tutorial. For multiple datasets, please refer to the
+multi-dataset training tutorial train_for_multidataset.py. This example cannot be used for
+multi-dataset tasks!
+
+######################## Instructions for using the training environment ########################
+The image of the debugging environment and the image of the training environment are two
+different images, and their local working directories differ. In a training task, pay attention
+to the following points.
+1. (1) The structure of the dataset uploaded for single-dataset training in this example
+    MNISTData.zip
+    ├── test
+    └── train
+
+
+2. Single-dataset training requires predefined functions
+(1) Copy the single dataset from obs to the training image
+function ObsToEnv(obs_data_url, data_dir)
+
+(2) Copy the output to obs
+function EnvToObs(train_dir, obs_train_url)
+
+(3) Download the input from Qizhi and init
+function DownloadFromQizhi(obs_data_url, data_dir)
+
+(4) Upload the output to Qizhi
+function UploadToQizhi(train_dir, obs_train_url)
+
+3. Three parameters need to be defined
+--data_url is the dataset you selected on the Qizhi platform
+
+--data_url, --train_url, --device_target: these 3 parameters must be defined first in a
+single-dataset task, otherwise an error will be reported.
+There is no need to add these parameters to the running parameters of the Qizhi platform,
+because they are predefined in the background; you only need to define them in your code.
+
+4. How the dataset is used
+A single dataset uses data_url as the input and data_dir (i.e. '/cache/data') as the path
+for accessing the dataset inside the image.
+For details, please refer to the following sample code.
+ +""" + +import os +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import mindspore.ops as ops +import time + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy the output to obs### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir,obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e)) + return +def DownloadFromQizhi(obs_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ObsToEnv(obs_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + ObsToEnv(obs_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. 
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+parser.add_argument('--data_url',
+                    help='path to training/inference dataset folder',
+                    default='/cache/data/')
+
+parser.add_argument('--train_url',
+                    help='output folder to save/load',
+                    default='/cache/output/')
+
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will run (default: Ascend); to use the CPU on the Qizhi platform set --device_target=CPU')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    data_dir = '/cache/data'
+    train_dir = '/cache/output'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(train_dir):
+        os.makedirs(train_dir)
+    ### Initialize the context and copy the data to the training image.
+    DownloadFromQizhi(args.data_url, data_dir)
+    ### The dataset path used here is data_dir + "/train".
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size)
+    if device_num > 1:
+        ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size)
+    if ds_train.get_dataset_size() == 0:
+        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
+
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy": Accuracy()},
+                      amp_level="O2")
+
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=cfg.keep_checkpoint_max)
+    # Note that this method saves the model file on each card, so a save path must be
+    # specified per card. In this example, get_rank() is added to distinguish the paths.
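+    # Illustration (added note): with RANK_SIZE=8 the per-card directories below become
+    # /cache/output/0/ ... /cache/output/7/, one per get_rank() value; with RANK_SIZE=1
+    # the checkpoints go directly to /cache/output/.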
+ if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + + ###Copy the trained output data from the local running environment back to obs, + ###and download it in the training task corresponding to the Qizhi platform + UploadToQizhi(train_dir,args.train_url) diff --git a/npu_new/train_for_c2net.py b/npu_new/train_for_c2net.py new file mode 100755 index 0000000..a044e34 --- /dev/null +++ b/npu_new/train_for_c2net.py @@ -0,0 +1,99 @@ +""" +######################## train lenet dataparallel example ######################## +train lenet and get network model files(.ckpt) + +The training of the intelligent computing network currently supports single dataset training, and does not require +the obs copy process.It only needs to define two parameters and then call it directly: + train_dir = '/cache/output' #The location of the output + data_dir = '/cache/dataset' #The location of the dataset + +""" + +import os +import argparse +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +import moxing as mox +from config import mnist_cfg as cfg +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.common import set_seed +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank, get_group_size +import mindspore.ops as ops + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') +if __name__ == "__main__": + args = parser.parse_args() + + ###define two parameters and then call it directly### + data_dir = '/cache/dataset' + train_dir = '/cache/output' + + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + ds_train = create_dataset(os.path.join(data_dir, "train"), cfg.batch_size) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + ds_train = create_dataset_parallel(os.path.join(data_dir, "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = 
nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In the example, get_rank() is added to distinguish different paths. + if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size,ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()], dataset_sink_mode=False) + + diff --git a/npu_new/train_for_multidataset.py b/npu_new/train_for_multidataset.py new file mode 100755 index 0000000..9a4096a --- /dev/null +++ b/npu_new/train_for_multidataset.py @@ -0,0 +1,220 @@ +""" +######################## multi-dataset train lenet example ######################## +This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset +training tutorial train.py. This example cannot be used for a single dataset! +""" +""" +######################## Instructions for using the training environment ######################## +1、(1)The structure of the dataset uploaded for multi-dataset training in this example + MNISTData.zip + ├── test + └── train + + checkpoint_lenet-1_1875.zip + ├── checkpoint_lenet-1_1875.ckpt + + (2)The dataset structure in the training image for multiple datasets in this example + workroot + ├── MNISTData + | ├── test + | └── train + └── checkpoint_lenet-1_1875 + ├── checkpoint_lenet-1_1875.ckpt + +2、Multi-dataset training requires predefined functions +(1)Copy multi-dataset from obs to training image +function MultiObsToEnv(multi_data_url, data_dir) + +(2)Copy the output to obs +function EnvToObs(train_dir, obs_train_url) + +(2)Download the input from Qizhi And Init +function DownloadFromQizhi(multi_data_url, data_dir) + +(2)Upload the output to Qizhi +function UploadToQizhi(train_dir, obs_train_url) + +3、4 parameters need to be defined +--data_url is the first dataset you selected on the Qizhi platform +--multi_data_url is the multi-dataset you selected on the Qizhi platform + +--data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code + +4、How the dataset is used +Multi-datasets use multi_data_url as input, data_dir + dataset name + file or folder name in the dataset as the +calling path of the dataset in the training image. +For example, the calling path of the train folder in the MNIST_Data dataset in this example is +data_dir + "/MNIST_Data" +"/train" + +For details, please refer to the following sample code. 
+""" + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + if not os.path.exists(path): + os.makedirs(path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy the output model to obs ### +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in 
a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. +parser.add_argument('--data_url', + help='path to training/inference dataset folder', + default= '/cache/data1/') + +parser.add_argument('--multi_data_url', + help='path to multi dataset', + default= '/cache/data/') + +parser.add_argument('--train_url', + help='model folder to save/load', + default= '/cache/output/') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args = parser.parse_args() + data_dir = '/cache/data' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The dataset path is used here:data_dir + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" + load_param_into_net(network, load_checkpoint(os.path.join(data_dir + "/checkpoint_lenet-1_1875", + "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. 
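+    # Note (hypothetical values, added for illustration): --multi_data_url arrives as a
+    # JSON string such as
+    #   [{"dataset_name": "MNISTData",
+    #     "dataset_url": "obs://bucket/MNISTData/"},
+    #    {"dataset_name": "checkpoint_lenet-1_1875",
+    #     "dataset_url": "obs://bucket/checkpoint_lenet-1_1875/"}]
+    # MultiObsToEnv above unpacks each entry into /cache/data/<dataset_name>/, which is
+    # why the train and checkpoint paths above are joined from data_dir.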
+    if device_num == 1:
+        outputDirectory = train_dir + "/"
+    if device_num > 1:
+        outputDirectory = train_dir + "/" + str(get_rank()) + "/"
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                 directory=outputDirectory,
+                                 config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if args.epoch_size:
+        epoch_size = args.epoch_size
+    print('epoch_size is:', epoch_size)
+    model.train(epoch_size,
+                ds_train,
+                callbacks=[time_cb, ckpoint_cb,
+                           LossMonitor()])
+    ### Copy the trained output from the local running environment back to OBS,
+    ### so it can be downloaded from the corresponding training task on the Qizhi platform.
+    UploadToQizhi(train_dir, args.train_url)
+
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..ff675e5
--- /dev/null
+++ b/test.py
@@ -0,0 +1 @@
+print('this is test.py, test 123')
\ No newline at end of file
diff --git a/test引号.md b/test引号.md
new file mode 100644
index 0000000..d0ee501
--- /dev/null
+++ b/test引号.md
@@ -0,0 +1,10 @@
+""
+“中文双引号”
+"vim"
+“”
+hello world “中文双引号”
+hi "en"
+'mark'
+''
+
+"abc hello"
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..0edaf3e
--- /dev/null
+++ b/train.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python
+#coding=utf-8
+'''
+If there are Chinese comments in the code, please add the following at the beginning:
+#!/usr/bin/python
+#coding=utf-8
+
+For A100 compatibility, please use the platform's recommended image with CUDA 11 before using
+the training environment, then adjust the code and submit the image.
+The image of this example is: dockerhub.pcl.ac.cn:5000/user-images/openi:cuda111_python37_pytorch191
+In the training environment, the uploaded dataset is placed under the /dataset directory automatically.
+For a single dataset:
+if MnistDataset_torch.zip is selected, the dataset directories are /dataset/train and /dataset/test.
+For multiple datasets:
+if MnistDataset_torch.zip and checkpoint_epoch1_0.73.zip are selected,
+the dataset directories are /dataset/MnistDataset_torch/train, /dataset/MnistDataset_torch/test
+and /dataset/checkpoint_epoch1_0.73/mnist_epoch1_0.73.pkl.
+
+The model download path is /model by default. Please write the model output to /model;
+the Qizhi platform provides file downloads from the /model directory.
+'''
+
+
+from model import Model
+import torch
+from torchvision.datasets import mnist
+from torch.nn import CrossEntropyLoss
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+from torchvision.transforms import ToTensor
+import argparse
+import datetime
+
+# Training settings
+parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+# The dataset location is placed under /dataset
+parser.add_argument('--traindata', default="/dataset/train", help='path to train dataset')
+parser.add_argument('--testdata', default="/dataset/test", help='path to test dataset')
+parser.add_argument('--epoch_size', type=int, default=1, help='number of epochs to train')
+parser.add_argument('--batch_size', type=int, default=256, help='batch size used per step')
+
+def gettime():
+    timestr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    return timestr
+
+if __name__ == '__main__':
+    args, unknown = parser.parse_known_args()
+    # log output
+    print(gettime(), 'cuda is available:{}'.format(torch.cuda.is_available()))
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    batch_size = args.batch_size
+    train_dataset = mnist.MNIST(root=args.traindata, train=True, transform=ToTensor(), download=False)
+    test_dataset = mnist.MNIST(root=args.testdata, train=False, transform=ToTensor(), download=False)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size)
+    model = Model().to(device)
+    sgd = SGD(model.parameters(), lr=1e-1)
+    cost = CrossEntropyLoss()
+    epoch = args.epoch_size
+    print(gettime(), 'epoch_size is:{}'.format(epoch))
+    for _epoch in range(epoch):
+        print(gettime(), 'epoch {} begins'.format(_epoch + 1))
+        model.train()
+        for idx, (train_x, train_label) in enumerate(train_loader):
+            train_x = train_x.to(device)
+            train_label = train_label.to(device)
+            sgd.zero_grad()
+            predict_y = model(train_x.float())
+            loss = cost(predict_y, train_label.long())
+            if idx % 10 == 0:
+                print(gettime(), 'idx: {}, loss: {}'.format(idx, loss.sum().item()))
+            loss.backward()
+            sgd.step()
+
+        correct = 0
+        _sum = 0
+        model.eval()
+        for idx, (test_x, test_label) in enumerate(test_loader):
+            predict_y = model(test_x.to(device).float()).detach()
+            # compare the predicted class indices with the labels on the CPU
+            predict_ys = predict_y.cpu().argmax(dim=-1)
+            correct += (predict_ys == test_label).sum().item()
+            _sum += test_label.size(0)
+        print(gettime(), 'accuracy: {:.2f}'.format(correct / _sum))
+        # The model output location is placed under /model
+        torch.save(model, '/model/mnist_epoch{}_{:.2f}.pkl'.format(_epoch + 1, correct / _sum))
diff --git a/wjtes20220926-log.txt b/wjtes20220926-log.txt
new file mode 100644
index 0000000..3b487a6
--- /dev/null
+++ b/wjtes20220926-log.txt
@@ -0,0 +1,233 @@
+/home/work
+
+start loading script
+
+finish loading script
+
+2022/09/26 16:24:20 Start to download master.zip
+
+2022/09/26 16:24:20 Total parts count 1
+
+2022/09/26 16:24:21 part(1) finished
+
+2022/09/26 16:24:21 Download object finished, downloadPath:/cache/code/master.zip
+
+panic: runtime error: index out of range [4] with length 4
+
+
+
+goroutine 1 [running]:
+
+main.main()
+
+	/home/houysh/openi/lewis/sync_for_grampus/downloader_for_obs.go:41 +0x4e0
+
+unzip finished;start to exec code;
+
+do nothing
+
+[Modelarts Service Log]user: uid=1101(work) gid=1101(work) groups=1101(work),1000(HwHiAiUser)
+ +[Modelarts Service Log]pwd: /home/work + +[Modelarts Service Log]boot_file: /cache/code/npu_test/npu/train_for_c2net.py + +[Modelarts Service Log]log_url: /tmp/log/train.log + +[Modelarts Service Log]command: /cache/code/npu_test/npu/train_for_c2net.py + +[Modelarts Service Log]local_code_dir: + +[Modelarts Service Log]Training start at 2022-09-26-16:24:21 + +[Modelarts Service Log][modelarts_create_log] modelarts-pipe found + +[ModelArts Service Log]modelarts-pipe: will create log file /tmp/log/train.log + +[Modelarts Service Log][modelarts_logger] modelarts-pipe found + +[ModelArts Service Log]modelarts-pipe: will create log file /tmp/log/train.log + +[ModelArts Service Log]modelarts-pipe: will write log file /tmp/log/train.log + +[ModelArts Service Log]modelarts-pipe: param for max log length: 1073741824 + +[ModelArts Service Log]modelarts-pipe: param for whether exit on overflow: 0 + +INFO:root:Using MoXing-v2.0.0.rc2.4b57a67b-4b57a67b + +INFO:root:Using OBS-Python-SDK-3.20.9.1 + +[Modelarts Service Log]2022-09-26 16:24:22,746 - INFO - Ascend Driver: Version=22.0.0.3 + +[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - you are advised to use ASCEND_DEVICE_ID env instead of DEVICE_ID, as the DEVICE_ID env will be discarded in later versions + +[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - particularly, ${ASCEND_DEVICE_ID} == ${DEVICE_ID}, it's the logical device id + +[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - Davinci training command + +[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - ['/usr/bin/python', '/cache/code/npu_test/npu/train_for_c2net.py'] + +[Modelarts Service Log]2022-09-26 16:24:22,747 - INFO - Wait for Rank table file ready + +[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - Rank table file (K8S generated) is ready for read + +[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - + +{ + + "status": "completed", + + "group_count": "1", + + "group_list": [ + + { + + "group_name": "job-wjtes2022092616t2327", + + "device_count": "1", + + "instance_count": "1", + + "instance_list": [ + + { + + "pod_name": "joba57ac677-job-wjtes2022092616t2327-0", + + "server_id": "192.168.0.189", + + "devices": [ + + { + + "device_id": "3", + + "device_ip": "192.4.68.236" + + } + + ] + + } + + ] + + } + + ] + +} + +[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - Rank table file (C7x) + +[Modelarts Service Log]2022-09-26 16:24:22,748 - INFO - + +{ + + "status": "completed", + + "version": "1.0", + + "server_count": "1", + + "server_list": [ + + { + + "server_id": "192.168.0.189", + + "device": [ + + { + + "device_id": "3", + + "device_ip": "192.4.68.236", + + "rank_id": "0" + + } + + ] + + } + + ] + +} + +[Modelarts Service Log]2022-09-26 16:24:22,749 - INFO - Rank table file (C7x) is generated + +[Modelarts Service Log]2022-09-26 16:24:22,749 - INFO - Current server + +[Modelarts Service Log]2022-09-26 16:24:22,749 - INFO - + +{ + + "server_id": "192.168.0.189", + + "device": [ + + { + + "device_id": "3", + + "device_ip": "192.4.68.236", + + "rank_id": "0" + + } + + ] + +} + +[Modelarts Service Log]2022-09-26 16:24:22,750 - INFO - bootstrap proc-rank-0-device-0 + +args: + +Namespace(device_target='Ascend', epoch_size=5) + +Traceback (most recent call last): + + File "/cache/code/npu_test/npu/train_for_c2net.py", line 50, in + + cfg.batch_size) + + File "/cache/code/npu_test/npu/dataset.py", line 32, in create_dataset + + mnist_ds = ds.MnistDataset(data_path) + + File 
"/usr/local/ma/python3.7/lib/python3.7/site-packages/mindspore/dataset/engine/validators.py", line 343, in new_method + + check_dir(dataset_dir) + + File "/usr/local/ma/python3.7/lib/python3.7/site-packages/mindspore/dataset/core/validator_helpers.py", line 551, in check_dir + + raise ValueError("The folder {} does not exist or is not a directory or permission denied!".format(dataset_dir)) + +ValueError: The folder /cache/dataset/train does not exist or is not a directory or permission denied! + +[Modelarts Service Log]2022-09-26 16:24:31,765 - ERROR - proc-rank-0-device-0 (pid: 159) has exited with non-zero code: 1 + +[Modelarts Service Log]2022-09-26 16:24:31,765 - INFO - Begin destroy training processes + +[Modelarts Service Log]2022-09-26 16:24:31,765 - INFO - proc-rank-0-device-0 (pid: 159) has exited + +[Modelarts Service Log]2022-09-26 16:24:31,765 - INFO - End destroy training processes + +[ModelArts Service Log]modelarts-pipe: total length: 3763 + +[Modelarts Service Log]Training end with return code: 1 + +[Modelarts Service Log]Training end at 2022-09-26-16:24:31 + +[Modelarts Service Log]Training completed. + +2022/09/26 16:24:51 start uploading model + +2022/09/26 16:24:51 file:train.log + +2022/09/26 16:24:52 finish uploading model \ No newline at end of file