diff --git a/npu/lewis/c2net_npu.py b/npu/lewis/c2net_npu.py new file mode 100755 index 0000000..9f33cce --- /dev/null +++ b/npu/lewis/c2net_npu.py @@ -0,0 +1,149 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] + if not os.path.exists(file_path): + os.makedirs(file_path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + #unzip dataset + os.system("unzip -d %s %s" % (file_path, path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. + +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. + if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + diff --git a/npu/lewis/c2net_npu_multi_dataset.py b/npu/lewis/c2net_npu_multi_dataset.py new file mode 100755 index 0000000..fd36194 --- /dev/null +++ b/npu/lewis/c2net_npu_multi_dataset.py @@ -0,0 +1,197 @@ +""" +######################## multi-dataset train lenet example ######################## +This example is a multi-dataset training tutorial. If it is a single dataset, please refer to the single dataset +training tutorial train.py. This example cannot be used for a single dataset! +""" +""" +######################## Instructions for using the training environment ######################## +1、(1)The structure of the dataset uploaded for multi-dataset training in this example + MNISTData.zip + ├── test + └── train + + checkpoint_lenet-1_1875.zip + ├── checkpoint_lenet-1_1875.ckpt + + (2)The dataset structure in the training image for multiple datasets in this example + workroot + ├── MNISTData + | ├── test + | └── train + └── checkpoint_lenet-1_1875 + ├── checkpoint_lenet-1_1875.ckpt + +2、Multi-dataset training requires predefined functions +(1)Copy multi-dataset from obs to training image +function MultiObsToEnv(multi_data_url, data_dir) + +(2)Copy the output to obs +function EnvToObs(train_dir, obs_train_url) + +(2)Download the input from Qizhi And Init +function DownloadFromQizhi(multi_data_url, data_dir) + +(2)Upload the output to Qizhi +function UploadToQizhi(train_dir, obs_train_url) + +3、4 parameters need to be defined +--data_url is the first dataset you selected on the Qizhi platform +--multi_data_url is the multi-dataset you selected on the Qizhi platform + +--data_url,--multi_data_url,--train_url,--device_target,These 4 parameters must be defined first in a multi-dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code + +4、How the dataset is used +Multi-datasets use multi_data_url as input, data_dir + dataset name + file or folder name in the dataset as the +calling path of the dataset in the training image. +For example, the calling path of the train folder in the MNIST_Data dataset in this example is +data_dir + "/MNIST_Data" +"/train" + +For details, please refer to the following sample code. +""" + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] + if not os.path.exists(file_path): + os.makedirs(file_path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + #unzip dataset + os.system("unzip -d %s %s" % (file_path, path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. + +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir + "/MNIST_Data" +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir + "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError( + "Please check dataset size > 0 and batch_size <= dataset size") + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The dataset path is used here:data_dir + "/checkpoint_lenet-1_1875"+"/checkpoint_lenet-1_1875.ckpt" + load_param_into_net(network, load_checkpoint(os.path.join(data_dir + "/checkpoint_lenet-1_1875", + "checkpoint_lenet-1_1875.ckpt"))) + if args.device_target != "Ascend": + model = Model(network,net_loss,net_opt,metrics={"accuracy": Accuracy()}) + else: + model = Model(network, net_loss,net_opt,metrics={"accuracy": Accuracy()},amp_level="O2") + config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. + if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) + diff --git a/npu/lewis/c2net_npu_pretrain.py b/npu/lewis/c2net_npu_pretrain.py new file mode 100755 index 0000000..7ab8af8 --- /dev/null +++ b/npu/lewis/c2net_npu_pretrain.py @@ -0,0 +1,233 @@ +""" +######################## single-dataset train lenet example ######################## +This example is a single-dataset training tutorial. If it is a multi-dataset, please refer to the multi-dataset training +tutorial train_for_multidataset.py. This example cannot be used for multi-datasets! + +######################## Instructions for using the training environment ######################## +The image of the debugging environment and the image of the training environment are two different images, +and the working local directories are different. In the training task, you need to pay attention to the following points. +1、(1)The structure of the dataset uploaded for single dataset training in this example + MNISTData.zip + ├── test + └── train + + +2、Single dataset training requires predefined functions +(1)Copy single dataset from obs to training image +function ObsToEnv(obs_data_url, data_dir) + +(2)Copy the output to obs +function EnvToObs(train_dir, obs_train_url) + +(3)Download the input from Qizhi And Init +function DownloadFromQizhi(obs_data_url, data_dir) + +(4)Upload the output to Qizhi +function UploadToQizhi(train_dir, obs_train_url) + +(5)Copy ckpt file from obs to training image. +function ObsUrlToEnv(obs_ckpt_url, ckpt_url) + +3、3 parameters need to be defined +--data_url is the dataset you selected on the Qizhi platform + +--data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset task, +otherwise an error will be reported. +There is no need to add these parameters to the running parameters of the Qizhi platform, +because they are predefined in the background, you only need to define them in your code. + +4、How the dataset is used +A single dataset uses data_url as the input, and data_dir (ie:'/cache/data') as the calling method +of the dataset in the image. +For details, please refer to the following sample code. + +5、How to load the checkpoint file +The checkpoint file is loaded by the ckpt_url parameter + +""" + +import os +import argparse +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore import load_checkpoint, load_param_into_net +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import mindspore.ops as ops +import time +import json +#from upload import UploadOutput + +### Copy single dataset from obs to training image### +def ObsToEnv(obs_data_url, data_dir): + try: + mox.file.copy_parallel(obs_data_url, data_dir) + print("Successfully Download {} to {}".format(obs_data_url, data_dir)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return +### Copy ckpt file from obs to training image### +### To operate on folders, use mox.file.copy_parallel. If copying a file. +### Please use mox.file.copy to operate the file, this operation is to operate the file +def ObsUrlToEnv(obs_ckpt_url, ckpt_url): + try: + mox.file.copy(obs_ckpt_url, ckpt_url) + print("Successfully Download {} to {}".format(obs_ckpt_url,ckpt_url)) + except Exception as e: + print('moxing download {} to {} failed: '.format(obs_ckpt_url, ckpt_url) + str(e)) + return + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] + if not os.path.exists(file_path): + os.makedirs(file_path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + #unzip dataset + os.system("unzip -d %s %s" % (file_path, path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, +### otherwise an error will be reported. +###There is no need to add these parameters to the running parameters of the Qizhi platform, +###because they are predefined in the background, you only need to define them in your code. +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + ###Copy ckpt file from obs to training image + ObsUrlToEnv(args.ckpt_url, ckpt_url) + ###Copy data from obs to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + ###The dataset path is used here:data_dir +"/train" + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + ds_train = create_dataset(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) + if device_num > 1: + ds_train = create_dataset_parallel(os.path.join(data_dir+ "/MNISTData", "train"), cfg.batch_size) + if ds_train.get_dataset_size() == 0: + raise ValueError("Please check dataset size > 0 and batch_size <= dataset size") + + network = LeNet5(cfg.num_classes) + net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") + net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) + time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) + ###The ckpt path is used here:ckpt_url + print('-------ckpt_url is:', args.ckpt_url) + load_param_into_net(network, load_checkpoint(ckpt_url)) + + if args.device_target != "Ascend": + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}) + else: + model = Model(network, + net_loss, + net_opt, + metrics={"accuracy": Accuracy()}, + amp_level="O2") + + config_ck = CheckpointConfig( + save_checkpoint_steps=cfg.save_checkpoint_steps, + keep_checkpoint_max=cfg.keep_checkpoint_max) + #Note that this method saves the model file on each card. You need to specify the save path on each card. + # In this example, get_rank() is added to distinguish different paths. + if device_num == 1: + outputDirectory = train_dir + "/" + if device_num > 1: + outputDirectory = train_dir + "/" + str(get_rank()) + "/" + ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", + directory=outputDirectory, + config=config_ck) + print("============== Starting Training ==============") + epoch_size = cfg['epoch_size'] + if (args.epoch_size): + epoch_size = args.epoch_size + print('epoch_size is: ', epoch_size) + + model.train(epoch_size, + ds_train, + callbacks=[time_cb, ckpoint_cb, + LossMonitor()]) \ No newline at end of file diff --git a/npu/lewis/c2net_testbigfile.py b/npu/lewis/c2net_testbigfile.py new file mode 100755 index 0000000..df94803 --- /dev/null +++ b/npu/lewis/c2net_testbigfile.py @@ -0,0 +1,114 @@ +""" +######################## train lenet example ######################## +train lenet and get network model files(.ckpt) +""" +#!/usr/bin/python +#coding=utf-8 + + +import os +import argparse + +import moxing as mox +from config import mnist_cfg as cfg +from dataset import create_dataset +from dataset_distributed import create_dataset_parallel +from lenet import LeNet5 +import json +import mindspore.nn as nn +from mindspore import context +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train import Model +from mindspore.nn.metrics import Accuracy +from mindspore import load_checkpoint, load_param_into_net +from mindspore.context import ParallelMode +from mindspore.communication.management import init, get_rank +import time + +### Copy multiple datasets from obs to training image ### +def MultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + path = data_dir + "/" + multi_data_json[i]["dataset_name"] + file_path = data_dir + "/" + os.path.splitext(multi_data_json[i]["dataset_name"])[0] + if not os.path.exists(file_path): + os.makedirs(file_path) + try: + mox.file.copy_parallel(multi_data_json[i]["dataset_url"], path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],path)) + #unzip dataset + os.system("unzip -d %s %s" % (file_path, path)) + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + MultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + MultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + +parser = argparse.ArgumentParser(description='MindSpore Lenet Example') +### --multi_data_url,--ckpt_url,--device_target,These 4 parameters must be defined first in a multi-dataset, +### otherwise an error will be reported. +### There is no need to add these parameters to the running parameters of the Qizhi platform, +### because they are predefined in the background, you only need to define them in your code. + +parser.add_argument('--multi_data_url', + help='dataset path in obs') + +parser.add_argument('--ckpt_url', + help='pre_train_model path in obs') + +parser.add_argument( + '--device_target', + type=str, + default="Ascend", + choices=['Ascend', 'CPU'], + help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') + +parser.add_argument('--epoch_size', + type=int, + default=5, + help='Training epochs.') + +if __name__ == "__main__": + args, unknown = parser.parse_known_args() + data_dir = '/cache/dataset' + train_dir = '/cache/output' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + ###Initialize and copy data to training image + DownloadFromQizhi(args.multi_data_url, data_dir) + print("--------start ls:") + os.system("cd /cache/dataset; ls -al") + print("--------end ls-----------") + diff --git a/npu/lewis/config.py b/npu/lewis/config.py new file mode 100755 index 0000000..22d68e2 --- /dev/null +++ b/npu/lewis/config.py @@ -0,0 +1,33 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +network config setting, will be used in train.py +""" + +from easydict import EasyDict as edict + +mnist_cfg = edict({ + 'num_classes': 10, + 'lr': 0.01, + 'momentum': 0.9, + 'epoch_size': 10, + 'batch_size': 32, + 'buffer_size': 1000, + 'image_height': 32, + 'image_width': 32, + 'save_checkpoint_steps': 1875, + 'keep_checkpoint_max': 150, + 'air_name': "lenet", +}) diff --git a/npu/lewis/dataset.py b/npu/lewis/dataset.py new file mode 100755 index 0000000..df9eecd --- /dev/null +++ b/npu/lewis/dataset.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Produce the dataset +""" + +import mindspore.dataset as ds +import mindspore.dataset.vision.c_transforms as CV +import mindspore.dataset.transforms.c_transforms as C +from mindspore.dataset.vision import Inter +from mindspore.common import dtype as mstype + + +def create_dataset(data_path, batch_size=32, repeat_size=1, + num_parallel_workers=1): + """ + create dataset for train or test + """ + # define dataset + mnist_ds = ds.MnistDataset(data_path) + + resize_height, resize_width = 32, 32 + rescale = 1.0 / 255.0 + shift = 0.0 + rescale_nml = 1 / 0.3081 + shift_nml = -1 * 0.1307 / 0.3081 + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/npu/lewis/dataset_distributed.py b/npu/lewis/dataset_distributed.py new file mode 100755 index 0000000..d813078 --- /dev/null +++ b/npu/lewis/dataset_distributed.py @@ -0,0 +1,55 @@ + +""" +Produce the dataset: +与单机不同的是,在数据集接口需要传入num_shards和shard_id参数,分别对应卡的数量和逻辑序号,建议通过HCCL接口获取: +get_rank:获取当前设备在集群中的ID。 +get_group_size:获取集群数量。 + +""" + +import mindspore.dataset as ds +import mindspore.dataset.vision.c_transforms as CV +import mindspore.dataset.transforms.c_transforms as C +from mindspore.dataset.vision import Inter +from mindspore.common import dtype as mstype +from mindspore.communication.management import init, get_rank, get_group_size + + +def create_dataset_parallel(data_path, batch_size=32, repeat_size=1, + num_parallel_workers=1, shard_id=0, num_shards=8): + """ + create dataset for train or test + """ + + resize_height, resize_width = 32, 32 + rescale = 1.0 / 255.0 + shift = 0.0 + rescale_nml = 1 / 0.3081 + shift_nml = -1 * 0.1307 / 0.3081 + # get shard_id and num_shards.Get the ID of the current device in the cluster And Get the number of clusters. + shard_id = get_rank() + num_shards = get_group_size() + # define dataset + mnist_ds = ds.MnistDataset(data_path, num_shards=num_shards, shard_id=shard_id) + + # define map operations + resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode + rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) + rescale_op = CV.Rescale(rescale, shift) + hwc2chw_op = CV.HWC2CHW() + type_cast_op = C.TypeCast(mstype.int32) + + # apply map operations on images + mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers) + mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers) + + # apply DatasetOps + buffer_size = 10000 + mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) # 10000 as in LeNet train script + mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) + mnist_ds = mnist_ds.repeat(repeat_size) + + return mnist_ds diff --git a/npu/lewis/lenet.py b/npu/lewis/lenet.py new file mode 100755 index 0000000..0600793 --- /dev/null +++ b/npu/lewis/lenet.py @@ -0,0 +1,60 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""LeNet.""" +import mindspore.nn as nn +from mindspore.common.initializer import Normal + + +class LeNet5(nn.Cell): + """ + Lenet network + + Args: + num_class (int): Number of classes. Default: 10. + num_channel (int): Number of channels. Default: 1. + + Returns: + Tensor, output tensor + Examples: + >>> LeNet(num_class=10) + + """ + def __init__(self, num_class=10, num_channel=1, include_top=True): + super(LeNet5, self).__init__() + self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid') + self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid') + self.relu = nn.ReLU() + self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2) + self.include_top = include_top + if self.include_top: + self.flatten = nn.Flatten() + self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02)) + self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02)) + self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02)) + + def construct(self, x): + x = self.conv1(x) + x = self.relu(x) + x = self.max_pool2d(x) + x = self.conv2(x) + x = self.relu(x) + x = self.max_pool2d(x) + if not self.include_top: + return x + x = self.flatten(x) + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x