train.py

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
#################train vgg16 example on cifar10########################
python train.py --data_path=$DATA_HOME --device_id=$DEVICE_ID
"""
import argparse
import datetime
import time
import os
import random

import numpy as np

import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig
from mindspore.train.model import Model, ParallelMode
from mindspore.train.serialization import load_param_into_net, load_checkpoint
from mindspore.train.loss_scale_manager import FixedLossScaleManager

from src.dataset import vgg_create_dataset
from src.dataset import classification_dataset
from src.crossentropy import CrossEntropy
from src.warmup_step_lr import warmup_step_lr
from src.warmup_cosine_annealing_lr import warmup_cosine_annealing_lr
from src.warmup_step_lr import lr_steps
from src.utils.logging import get_logger
from src.utils.util import get_param_groups
from src.vgg import vgg16

random.seed(1)
np.random.seed(1)

class ProgressMonitor(Callback):
    """monitor loss and time"""
    def __init__(self, args_param):
        super(ProgressMonitor, self).__init__()
        self.me_epoch_start_time = 0
        self.me_epoch_start_step_num = 0
        self.args = args_param
        self.ckpt_history = []

    def begin(self, run_context):
        self.args.logger.info('start network train...')

    def epoch_begin(self, run_context):
        pass

    def epoch_end(self, run_context):
        """
        Called after each epoch finished.

        Args:
            run_context (RunContext): Include some information of the model.
        """
        cb_params = run_context.original_args()
        me_step = cb_params.cur_step_num - 1
        real_epoch = me_step // self.args.steps_per_epoch
        time_used = time.time() - self.me_epoch_start_time
        # average throughput (images/sec) over the finished epoch, across all ranks
        fps_mean = self.args.per_batch_size * (me_step - self.me_epoch_start_step_num) * self.args.group_size / time_used
        self.args.logger.info('epoch[{}], iter[{}], loss:{}, mean_fps:{:.2f} '
                              'imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean))

        if self.args.rank_save_ckpt_flag:
            import glob
            # record checkpoints newly produced by this rank
            ckpts = glob.glob(os.path.join(self.args.outputs_dir, '*.ckpt'))
            for ckpt in ckpts:
                ckpt_fn = os.path.basename(ckpt)
                if not ckpt_fn.startswith('{}-'.format(self.args.rank)):
                    continue
                if ckpt in self.ckpt_history:
                    continue
                self.ckpt_history.append(ckpt)
                self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{}, '
                                      'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn))

        self.me_epoch_start_step_num = me_step
        self.me_epoch_start_time = time.time()

    def step_begin(self, run_context):
        pass

    def step_end(self, run_context, *me_args):
        pass

    def end(self, run_context):
        self.args.logger.info('end network train...')

def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument('--device_id', type=int, default=1, help='device id of GPU or Ascend. (Default: 1)')

    # dataset related
    parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10")
    parser.add_argument('--data_path', type=str, default='', help='train data dir')

    # network related
    parser.add_argument('--pre_trained', default='', type=str, help='model_path, local pretrained model to load')
    parser.add_argument('--lr_gamma', type=float, default=0.1,
                        help='decrease lr by a factor of exponential lr_scheduler')
    parser.add_argument('--eta_min', type=float, default=0., help='eta_min in cosine_annealing scheduler')
    parser.add_argument('--T_max', type=int, default=150, help='T-max in cosine_annealing scheduler')

    # logging and checkpoint related
    parser.add_argument('--log_interval', type=int, default=100, help='logging interval')
    parser.add_argument('--ckpt_path', type=str, default='outputs/', help='checkpoint save location')
    parser.add_argument('--ckpt_interval', type=int, default=2, help='ckpt_interval')
    parser.add_argument('--is_save_on_master', type=int, default=1, help='save ckpt on master or all rank')

    # distributed related
    parser.add_argument('--is_distributed', type=int, default=0, help='if multi device')
    parser.add_argument('--rank', type=int, default=0, help='local rank of distributed')
    parser.add_argument('--group_size', type=int, default=1, help='world size of distributed')
    args_opt = parser.parse_args()
    args_opt = merge_args(args_opt, cloud_args)

    # the remaining hyper-parameters come from the dataset-specific config
    if args_opt.dataset == "cifar10":
        from src.config import cifar_cfg as cfg
    else:
        from src.config import imagenet_cfg as cfg

    args_opt.label_smooth = cfg.label_smooth
    args_opt.label_smooth_factor = cfg.label_smooth_factor
    args_opt.lr_scheduler = cfg.lr_scheduler
    args_opt.loss_scale = cfg.loss_scale
    args_opt.max_epoch = cfg.max_epoch
    args_opt.warmup_epochs = cfg.warmup_epochs
    args_opt.lr = cfg.lr
    args_opt.lr_init = cfg.lr_init
    args_opt.lr_max = cfg.lr_max
    args_opt.momentum = cfg.momentum
    args_opt.weight_decay = cfg.weight_decay
    args_opt.per_batch_size = cfg.batch_size
    args_opt.num_classes = cfg.num_classes
    args_opt.buffer_size = cfg.buffer_size
    args_opt.ckpt_save_max = cfg.keep_checkpoint_max
    args_opt.pad_mode = cfg.pad_mode
    args_opt.padding = cfg.padding
    args_opt.has_bias = cfg.has_bias
    args_opt.batch_norm = cfg.batch_norm
    args_opt.initialize_mode = cfg.initialize_mode
    args_opt.has_dropout = cfg.has_dropout

    args_opt.lr_epochs = list(map(int, cfg.lr_epochs.split(',')))
    args_opt.image_size = list(map(int, cfg.image_size.split(',')))

    return args_opt

def merge_args(args_opt, cloud_args):
    """merge cloud_args into args_opt, overriding parsed values"""
    args_dict = vars(args_opt)
    if isinstance(cloud_args, dict):
        for key_arg in cloud_args.keys():
            val = cloud_args[key_arg]
            if key_arg in args_dict and val:
                arg_type = type(args_dict[key_arg])
                if arg_type is not None:
                    val = arg_type(val)
                args_dict[key_arg] = val
    return args_opt
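
# Hypothetical example of how cloud-supplied arguments override the CLI values
# (the keys and paths below are placeholders, not values defined in this repo):
#   args = parse_args(cloud_args={'data_path': '/cache/data', 'ckpt_path': '/cache/output/'})
# Only keys already present in the parsed namespace and carrying a truthy value are
# applied, and each value is cast to the type of the existing argument.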

if __name__ == '__main__':
    args = parse_args()
    device_num = int(os.environ.get("DEVICE_NUM", 1))
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
        elif args.device_target == "GPU":
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
        device_num = args.group_size
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
    else:
        context.set_context(device_id=args.device_id)
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    # select whether only the master rank or all ranks save ckpt, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    if args.dataset == "cifar10":
        dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, args.rank, args.group_size)
    else:
        dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size,
                                         args.rank, args.group_size)
    batch_num = dataset.get_dataset_size()
    args.steps_per_epoch = dataset.get_dataset_size()
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')

    # get network and init
    network = vgg16(args.num_classes, args)

    # pre_trained
    if args.pre_trained:
        load_param_into_net(network, load_checkpoint(args.pre_trained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    elif args.lr_scheduler == 'step':
        lr = lr_steps(0, lr_init=args.lr_init, lr_max=args.lr_max, warmup_epochs=args.warmup_epochs,
                      total_epochs=args.max_epoch, steps_per_epoch=batch_num)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    if args.dataset == "cifar10":
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
        model = Model(network, loss_fn=loss, optimizer=opt, metrics={'acc'},
                      amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, dataset, callbacks=callbacks)
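
Usage sketch (paths and device ids below are placeholders; the flags come from parse_args above, and epoch count, batch size, learning rate, etc. are taken from src/config.py):

    # single-device CIFAR-10 training
    python train.py --dataset=cifar10 --data_path=/path/to/cifar10 --device_id=0

    # multi-device training: export DEVICE_NUM and pass --is_distributed=1;
    # rank and group size are then obtained from the communication backend after init()
    DEVICE_NUM=8 python train.py --is_distributed=1 --dataset=imagenet2012 --data_path=/path/to/imagenet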