You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

profiling.py 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Profiling api file."""
  16. import os
  17. import time
  18. from marshmallow import ValidationError
  19. from tabulate import tabulate
  20. from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
  21. from mindinsight.profiler.analyser.integrator import Integrator
  22. from mindinsight.profiler.common._utils import get_file_names, fwrite_format
  23. from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
  24. ProfilerIOException
  25. from mindinsight.profiler.common.log import logger
  26. from mindinsight.profiler.common.validator.checkparam import \
  27. check_bool, check_subgraph
  28. from mindinsight.profiler.common.validator.validate_path import \
  29. validate_and_normalize_path
  30. from mindinsight.profiler.parser.aicpu_data_parser import DataPreProcessParser
  31. from mindinsight.profiler.parser.framework_parser import FrameworkParser
  32. from mindinsight.profiler.parser.hwts_log_parser import HWTSLogParser
  33. from mindinsight.profiler.parser.minddata_parser import MinddataParser
  34. from mindinsight.profiler.parser.minddata_pipeline_parser import \
  35. MinddataPipelineParser
  36. from mindinsight.profiler.parser.optime_parser import OPComputeTimeParser
  37. from mindinsight.profiler.parser.step_trace_parser import StepTraceParser
  38. from mindinsight.utils.exceptions import MindInsightException
  39. PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling"
  40. INIT_OP_NAME = 'Default/InitDataSetQueue'
  41. class Profiler:
  42. """
  43. Performance profiling API.
  44. Enable MindSpore users to profile the performance of neural network.
  45. Args:
  46. subgraph (str): Define which subgraph to monitor and analyse, can be 'all', 'Default', 'Gradients'.
  47. is_detail (bool): Whether to show profiling data for op_instance level, only show optype level if False.
  48. is_show_op_path (bool): Whether to save the full path for each op instance.
  49. output_path (str): Output data path.
  50. optypes_to_deal (str): Op type names, the data of which optype should be collected and analysed,
  51. will deal with all op if null; Different op types should be seperated by comma.
  52. optypes_not_deal (str): Op type names, the data of which optype will not be collected and analysed;
  53. Different op types should be seperated by comma.
  54. Examples:
  55. >>> from mindinsight.profiler import Profiler
  56. >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
  57. >>> device_id=int(os.environ["DEVICE_ID"]))
  58. >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
  59. >>> model = Model(train_network)
  60. >>> dataset = get_dataset()
  61. >>> model.train(2, dataset)
  62. >>> profiler.analyse()
  63. """
  64. _base_profiling_container_path = "/var/log/npu/profiling/container"
  65. _hwts_output_filename_target = "output_format_data_hwts_"
  66. _opcompute_output_filename_target = "output_op_compute_time_"
  67. _aicpu_op_output_filename_target = "output_data_preprocess_aicpu_"
  68. def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
  69. optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
  70. # get device_id and device_target
  71. self._get_devid_and_devtarget()
  72. self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
  73. data_path = os.path.join(self._container_path, "data")
  74. if not os.path.exists(data_path):
  75. os.makedirs(data_path, exist_ok=True)
  76. self._output_path = validate_and_normalize_path(output_path,
  77. 'Profiler output path (' + output_path + ')')
  78. self._output_path = os.path.join(self._output_path, "profiler")
  79. if not os.path.exists(self._output_path):
  80. os.makedirs(self._output_path, exist_ok=True)
  81. os.environ['PROFILING_MODE'] = 'true'
  82. os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
  83. os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
  84. os.environ['DEVICE_ID'] = self._dev_id
  85. # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
  86. try:
  87. import mindspore.context as context
  88. context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
  89. except ImportError:
  90. logger.error("Profiling: fail to import context from mindspore.")
  91. except ValueError:
  92. logger.error("Profiling: fail to set context enable_profiling")
  93. os.environ['AICPU_PROFILING_MODE'] = 'true'
  94. os.environ['PROFILING_DIR'] = str(self._container_path)
  95. self._subgraph = check_subgraph(subgraph)
  96. self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
  97. self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
  98. self._detail = check_bool(is_detail, 'is_detail')
  99. self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
  100. self._profiling_job_id = job_id
  101. # add job id env through user input later
  102. self._job_id_env = 0
  103. self._start_time = int(time.time() * 10000000)
  104. logger.info("Profiling: profiling start time: %d", self._start_time)
  105. def analyse(self):
  106. """
  107. Collect and analyse performance data, called after training or during training.
  108. Examples:
  109. >>> from mindinsight.profiler import Profiler
  110. >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
  111. >>> device_id=int(os.environ["DEVICE_ID"]))
  112. >>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
  113. >>> model = Model(train_network)
  114. >>> dataset = get_dataset()
  115. >>> model.train(2, dataset)
  116. >>> profiler.analyse()
  117. """
  118. try:
  119. from mindspore.communication.management import release
  120. release()
  121. except ImportError:
  122. logger.error("Profiling: fail to import release from mindspore.")
  123. job_id = self._get_profiling_job_id()
  124. logger.info("Profiling: job id is %s ", job_id)
  125. source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
  126. # parse hwts.log.data.45.dev file, and get task profiling data
  127. hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
  128. hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
  129. hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
  130. result = hwtslog_parser.execute()
  131. if not result:
  132. logger.error("Profiling: fail to parse hwts log file.")
  133. return
  134. # parse Framework file, and get the relation of op and tasks
  135. framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
  136. framework_parser.parse()
  137. op_task_dict = framework_parser.to_task_id_full_op_name_dict()
  138. if not op_task_dict:
  139. logger.error("Profiling: fail to parse framework files.")
  140. return
  141. # get op compute time from hwts data and framework data, write output_op_compute_time.txt
  142. opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
  143. opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
  144. optime_parser = OPComputeTimeParser(
  145. hwts_output_filename, opcompute_output_filename,
  146. op_task_dict, self._output_path, self._dev_id
  147. )
  148. optime_parser.execute()
  149. # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
  150. output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
  151. output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
  152. aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
  153. aicpu_data_parser.execute()
  154. # Parsing minddata AICPU profiling
  155. MinddataParser.execute(source_path, self._output_path, self._dev_id)
  156. # parse minddata pipeline operator and queue
  157. try:
  158. pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
  159. pipeline_parser.parse()
  160. except MindInsightException as err:
  161. logger.warning(err.message)
  162. # analyse op compute time info
  163. try:
  164. self._analyser_op_info()
  165. except MindInsightException as err:
  166. logger.warning(err.message)
  167. # analyse step trace info
  168. try:
  169. self._analyse_step_trace(source_path, framework_parser)
  170. except MindInsightException as err:
  171. logger.warning(err.message)
  172. # analyse timeline info
  173. try:
  174. self._analyse_timeline(aicpu_data_parser, optime_parser)
  175. except (ProfilerIOException, ProfilerFileNotFoundException, ValidationError) as err:
  176. logger.warning('Fail to write timeline data: %s', err)
  177. def _analyse_step_trace(self, source_path, framework_parser):
  178. """
  179. Analyse step trace data and save the result.
  180. Args:
  181. source_path (str): The directory that contains the step trace original data.
  182. framework_parser (FrameworkParser): The framework parse instance.
  183. """
  184. logger.info("Begin to parse step trace.")
  185. # construct output path
  186. step_trace_intermediate_file_path = os.path.join(
  187. self._output_path,
  188. f'step_trace_raw_{self._dev_id}_detail_time.csv'
  189. )
  190. point_info_file_path = os.path.join(
  191. self._output_path,
  192. 'step_trace_point_info.json'
  193. )
  194. # whether keep the first step
  195. skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
  196. point_info = framework_parser.point_info
  197. # parser the step trace files and save the result to disk
  198. parser = StepTraceParser(input_dir=source_path,
  199. output_file_path=step_trace_intermediate_file_path,
  200. job_id=self._job_id_env,
  201. skip_first_step=skip_first_step_flag)
  202. parser.update_tag_op_type_map(point_info)
  203. parser.parse_and_save()
  204. point_info = parser.record_point_info(point_info, point_info_file_path)
  205. # print parser result
  206. parser.show()
  207. logger.info("Finish saving the intermediate result: %s", step_trace_intermediate_file_path)
  208. logger.info("The point info is: %s", point_info)
  209. def _analyse_timeline(self, aicpu_parser, optime_parser):
  210. """
  211. Analyse and parse timeline info.
  212. Args:
  213. aicpu_parser (DataPreProcessParser): The parser instance for AI CPU operator
  214. execution time calculation.
  215. optime_parser (OPComputeTimeParserParser): The parser instance for AI Core
  216. operator execution time calculation.
  217. """
  218. timeline_analyser = AnalyserFactory.instance().get_analyser(
  219. 'timeline', self._output_path, self._dev_id
  220. )
  221. # Get framework info
  222. aicoredetail_analyser = AnalyserFactory.instance().get_analyser(
  223. 'aicore_detail', self._output_path, self._dev_id
  224. )
  225. framework_info = aicoredetail_analyser.query()
  226. # Get all reduce info
  227. step_trace_analyser = AnalyserFactory.instance().get_analyser(
  228. 'step_trace', self._output_path, self._dev_id
  229. )
  230. all_reduce_info = step_trace_analyser.query_for_all_reduce()
  231. # Get timeline info
  232. logger.info('Start writing timeline info...')
  233. logger.info('Warm Prompt: It could take a few minutes if you are training '
  234. 'with a complex network or more than 10 steps.')
  235. # Add info into timeline, such as AI CPU, AllReduce, framework info.
  236. aicpu_info = aicpu_parser.query_aicpu_data()
  237. min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
  238. timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info, min_cycle_counter)
  239. timeline_analyser.write_timeline()
  240. timeline_analyser.write_timeline_summary()
  241. def __del__(self):
  242. """Disable the profiling collection service, called after training."""
  243. os.environ['PROFILING_MODE'] = str("false")
  244. try:
  245. import mindspore.context as context
  246. context.set_context(enable_profiling=False)
  247. except ImportError:
  248. pass
  249. def _get_profiling_job_id(self):
  250. """Get profiling job id, which was generated by ada service.
  251. Returns:
  252. str: profiling jon id.
  253. """
  254. if self._profiling_job_id:
  255. return self._profiling_job_id
  256. job_id = ""
  257. cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'"
  258. r = os.popen(cmd)
  259. profiling_job_dirs = r.readlines()
  260. r.close()
  261. for item in profiling_job_dirs:
  262. path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip())
  263. log_file = get_file_names(path, "host_start.log")
  264. if not log_file:
  265. logger.error("Profiling: job path %s, host_start.log not exist.", path)
  266. continue
  267. log_file = os.path.join(path, log_file[0])
  268. item_dict = self._parse_host_start_log(log_file)
  269. if not item_dict:
  270. logger.error("Profiling: job path %s, fail to get job start info.", path)
  271. continue
  272. if self._start_time > int(item_dict["start_time"]):
  273. logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
  274. path, item_dict["start_time"], self._start_time)
  275. break
  276. if self._dev_id != item_dict["device_id"]:
  277. logger.info("Profiling: job path %s, dev id %s, training device id %s.",
  278. path, item_dict["device_id"], self._dev_id)
  279. continue
  280. job_id = item.strip()
  281. break
  282. if not job_id:
  283. msg = ("Fail to get profiling job, please check whether job dir was generated")
  284. raise RuntimeError(msg)
  285. return job_id
  286. def _parse_host_start_log(self, input_file):
  287. """
  288. Parse host start log file, get the device id and start time of the job.
  289. Args:
  290. input_file (str): The file path of the host start log file.
  291. Returns:
  292. dict, job start time and device id.
  293. """
  294. item_dict = {}
  295. for line in open(input_file):
  296. if "Device" in line:
  297. item_dict["device_id"] = line[7:len(line)-2]
  298. elif "clock_realtime" in line:
  299. item_dict["start_time"] = line[16:len(line)-3]
  300. return item_dict
  301. def _analyser_op_info(self):
  302. """Analyse the operator information."""
  303. integrator = Integrator(self._output_path, self._dev_id)
  304. integrator.integrate()
  305. aicore_type_result = self._query_op_type_info()
  306. detail_file_path = os.path.join(
  307. self._output_path,
  308. 'output_op_compute_time_detail_{}.txt'.format(self._dev_id)
  309. )
  310. fwrite_format(detail_file_path, data_source='title:op compute time')
  311. display_names = [
  312. 'optype_name', 'compute_time(ms, per-step)',
  313. 'called_times(per-step)', 'percent'
  314. ]
  315. data_source = tabulate(aicore_type_result, display_names)
  316. fwrite_format(detail_file_path, data_source=data_source, is_print=True)
  317. if self._detail:
  318. op_type_order = [item[0] for item in aicore_type_result]
  319. aicore_detail_result = self._query_op_detail_info(op_type_order)
  320. fwrite_format(detail_file_path, data_source='', is_print=True)
  321. fwrite_format(detail_file_path, data_source='Detail:', is_print=True)
  322. data_source = tabulate(
  323. aicore_detail_result.get('object'),
  324. aicore_detail_result.get('col_name')
  325. )
  326. fwrite_format(detail_file_path, data_source=data_source, is_print=True)
  327. def _query_op_type_info(self):
  328. """
  329. Query AICORE operator type information.
  330. Returns:
  331. list[list], the AICORE operator type and execution time information.
  332. """
  333. condition = {
  334. 'sort_condition': {
  335. 'name': 'execution_time',
  336. 'type': 'descending'
  337. }
  338. }
  339. analyser = AnalyserFactory.instance().get_analyser(
  340. 'aicore_type', self._output_path, self._dev_id
  341. )
  342. result = analyser.query(condition)
  343. return result.get('object')
  344. def _query_op_detail_info(self, op_type_order):
  345. """
  346. Query AICORE operator detail information.
  347. Args:
  348. op_type_order(list): The name of the op type in order.
  349. Returns:
  350. dict, the AICORE operator detail information.
  351. """
  352. op_type_condition = {}
  353. if self._valid_optype_name:
  354. op_type_condition['in'] = self._valid_optype_name
  355. if self._filt_optype_names:
  356. op_type_condition['not_in'] = self._filt_optype_names
  357. subgraph_condition = {}
  358. if self._subgraph != 'all':
  359. subgraph_condition['in'] = [self._subgraph]
  360. filter_condition = {
  361. 'op_type': op_type_condition,
  362. 'subgraph': subgraph_condition,
  363. 'is_display_detail': False,
  364. 'is_display_full_op_name': self._withfullpath
  365. }
  366. analyser = AnalyserFactory.instance().get_analyser(
  367. 'aicore_detail', self._output_path, self._dev_id
  368. )
  369. result = analyser.query_and_sort_by_op_type(
  370. filter_condition, op_type_order
  371. )
  372. return result
  373. def _get_devid_and_devtarget(self):
  374. """Get device id and target of this training."""
  375. device_target = ""
  376. dev_id = ""
  377. try:
  378. import mindspore.context as context
  379. dev_id = str(context.get_context("device_id"))
  380. device_target = context.get_context("device_target")
  381. except ImportError:
  382. logger.error("Profiling: fail to import context from mindspore.")
  383. except ValueError as err:
  384. logger.error("Profiling: fail to get context, %s", err)
  385. if not dev_id or not dev_id.isdigit():
  386. dev_id = os.getenv('DEVICE_ID')
  387. if not dev_id or not dev_id.isdigit():
  388. dev_id = "0"
  389. logger.error("Fail to get DEVICE_ID, use 0 instead.")
  390. if device_target and device_target != "Davinci" \
  391. and device_target != "Ascend":
  392. msg = ("Profiling: unsupport backend: %s" \
  393. % device_target)
  394. raise RuntimeError(msg)
  395. self._dev_id = dev_id