You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

convert.py 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Parse tensor files from async dump structure."""
  16. import os
  17. import stat
  18. import sys
  19. from collections import namedtuple
  20. from importlib import import_module
  21. from pathlib import Path
  22. import numpy as np
  23. PARSE_ARGS_FIELDS = ['dump_path', 'format', 'output_path', 'output_file_type',
  24. 'input', 'output', 'shape',
  25. 'custom_script_path', 'dump_version']
  26. class ArgsParser(namedtuple("ArgsParser", PARSE_ARGS_FIELDS)):
  27. """Args Parser object."""
  28. __slots__ = ()
  29. def __new__(cls, **kwargs):
  30. new_kwargs = {field: kwargs.get(field) for field in PARSE_ARGS_FIELDS}
  31. new_kwargs['dump_version'] = kwargs.get('dump_version', '2.0')
  32. return super().__new__(cls, **new_kwargs)
  33. def load_hisi_tools(msaccucmp_path=None):
  34. """
  35. Load HISI tools.
  36. Args:
  37. msaccucmp_path (Path): The path object of msaccucmp.py path.
  38. Returns:
  39. tuple, the tuple of utils, common, shape_conversion module in toolkit package.
  40. """
  41. msaccucmp_path = get_msaccucmp_path(msaccucmp_path)
  42. hisi_tool_path = msaccucmp_path.parent
  43. if str(hisi_tool_path) not in sys.path:
  44. sys.path.append(str(hisi_tool_path))
  45. try:
  46. hisi_utils = import_module('utils')
  47. hisi_common = import_module('common')
  48. hisi_format_conversion = import_module('shape_conversion').FormatConversionMain
  49. except ModuleNotFoundError:
  50. raise ModuleNotFoundError(f'Failed to load HISI tools under {msaccucmp_path}')
  51. return hisi_utils, hisi_common, hisi_format_conversion
  52. def get_msaccucmp_path(msaccucmp_path=None):
  53. """
  54. Get the Path of HISI msaccucmp file.
  55. Args:
  56. msaccucmp_path (str): The path of `msaccucmp.py` or `msaccucmp.pyc`. Default: None.
  57. Returns:
  58. Path, the msaccucmp.py file path object.
  59. """
  60. if msaccucmp_path is not None:
  61. msaccucmp_path = Path(msaccucmp_path).resolve()
  62. if not msaccucmp_path.exists():
  63. raise FileNotFoundError(f"File {msaccucmp_path} doesn't exists. Please check the input value.")
  64. return msaccucmp_path
  65. # search msaccucmp file under $ASCEND_AICPU_PATH
  66. ascend_aicpu_path = os.environ.get('ASCEND_AICPU_PATH')
  67. if not ascend_aicpu_path:
  68. raise FileNotFoundError("Failed to find $ASCEND_AICPU_PATH parameter in environment. Please make sure you have"
  69. "install run packages and set the environment correctly.")
  70. ascend_aicpu_path = Path(ascend_aicpu_path).resolve()
  71. msaccucmp_files = list(ascend_aicpu_path.rglob('msaccucmp.py*'))
  72. if not msaccucmp_files:
  73. raise FileNotFoundError(f"Failed to find msaccucmp.py or msaccucmp.pyc file under {ascend_aicpu_path}. Please"
  74. f"make sure you have install toolkit package successfully.")
  75. return msaccucmp_files[0]
  76. class DumpRootDirConverter:
  77. """Convert the async dump data under dump root directory into host format."""
  78. def __init__(self, data_loader, msaccucmp_path=None):
  79. self.data_loader = data_loader
  80. self.dump_data_dir = Path(data_loader.get_net_dir())
  81. self.failed_summary_file = self.dump_data_dir.joinpath('convert_failed_files_summary.txt')
  82. self._hisi_tools = load_hisi_tools(msaccucmp_path)
  83. self.check_async_dir()
  84. def check_async_dir(self):
  85. """Check if this directory is dumped asynchronously on Ascend."""
  86. is_sync = self.data_loader.get_sync_flag()
  87. if is_sync:
  88. raise ValueError(f"The data under {str(self.dump_data_dir)} is not dumped asynchronously.")
  89. def convert(self):
  90. """Convert dump data under root dump data directory from device format to host format."""
  91. source_iterations = self.dump_data_dir.glob(f'device_[0-9]*/*_graph_[0-9]*/[0-9]*/[0-9]*/')
  92. failed_lines = []
  93. if self.failed_summary_file.is_file():
  94. self.failed_summary_file.unlink()
  95. for iter_path in source_iterations:
  96. dump_path = str(iter_path)
  97. res = DirConvert(dump_path=dump_path, output_path=dump_path, hisi_tools=self._hisi_tools).convert()
  98. failed_lines.extend(res)
  99. # add tensor format in file name
  100. if failed_lines:
  101. self.save_failed_fines(failed_lines)
  102. return failed_lines
  103. def save_failed_fines(self, failed_lines):
  104. """Save failed fines to file."""
  105. with self.failed_summary_file.open('w') as handler:
  106. for line in failed_lines:
  107. handler.write(line + '\n')
  108. self.failed_summary_file.chmod(stat.S_IRUSR)
  109. hisi_utils = self._hisi_tools[0]
  110. hisi_utils.print_info_log(f"Failed summary has saved to {str(self.failed_summary_file)}")
class DirConvert:
    """Convert the async dump data under one directory into host format."""
    def __init__(self, dump_path, output_path, target_format='NCHW', output_file_type='npy', hisi_tools=None):
        # Argument object mirroring the CLI of the HISI msaccucmp tool.
        self.args_parser = ArgsParser(dump_path=dump_path,
                                      format=target_format,
                                      output_path=output_path,
                                      output_file_type=output_file_type)
        self.output_path = Path(output_path).absolute()
        # The HISI tool records operators it failed to convert in this file.
        self.failed_file_path = self.output_path.joinpath('convert_failed_file_list.txt')
        # Tools may be injected by the caller to avoid reloading per directory.
        self.hisi_utils, self.hisi_common, self.hisi_format_conversion = load_hisi_tools() \
            if hisi_tools is None else hisi_tools
    def _is_npy_target(self):
        """Check if the output_file type is npy."""
        return self.args_parser.output_file_type == 'npy'
    def clean_old_files(self):
        """Clean old files."""
        # clean failed file record
        if self.failed_file_path.is_file():
            self.failed_file_path.unlink()
        # clean old converted data.
        old_data_files = self.output_path.glob(f'*.{self.args_parser.output_file_type}')
        for file in old_data_files:
            file.unlink()
    def convert(self):
        """Convert async dump data of src_dir to target_format and saved in output_dir."""
        conversion = self.hisi_format_conversion(self.args_parser)
        self.clean_old_files()
        failed_lines = []
        ret = conversion.convert_format()
        self.rename_generated_npy_file()
        # A return code other than "none error" means some operators failed;
        # retry them one by one from the failed-file record.
        if ret != self.hisi_utils.VECTOR_COMPARISON_NONE_ERROR:
            self.hisi_utils.print_info_log(
                f"Begin to convert failed operator in {str(self.failed_file_path)} one by one.")
            failed_lines = self.convert_failed_tensors()
        else:
            self.hisi_utils.print_info_log(
                f"All tensor under {self.args_parser.dump_path} have been converted to {self.output_path} "
                f"successfully.")
        return failed_lines
    def rename_generated_npy_file(self):
        """Rename the npy file generated by HISI tool to MS file name format."""
        # before change
        # file name: {op_type}.{op_name_with_scope}.{task_id}(.stream_id).{timestamp}.{tensor_type}.{slot}.{shape}.npy
        # after change
        # file name: {op_type}.{op_name}.{task_id}(.stream_id).{timestamp}.{tensor_type}.{slot}.{format}.npy
        if not self._is_npy_target():
            return
        self.hisi_utils.print_info_log(
            f"Start to rename npy files under {self.output_path}")
        target_format = self.args_parser.format
        old_data_files = self.output_path.glob('*.npy')
        for file in old_data_files:
            name_splits = file.name.split('.')
            # Drop the scope prefix, keeping only the last segment of the op name.
            name_splits[1] = name_splits[1].split('_')[-1]
            # Replace the shape field with the target format string.
            name_splits[-2] = target_format
            new_file_name = '.'.join(name_splits)
            file.chmod(stat.S_IRUSR)
            file.rename(file.with_name(new_file_name))
    def convert_failed_tensors(self):
        """Convert failed tensors from failed txt."""
        failed_lines = []
        if not self.failed_file_path.is_file():
            return failed_lines
        os.chmod(self.failed_file_path, stat.S_IRUSR)
        with self.failed_file_path.open() as handler:
            # Read line by line; an empty line terminates the loop.
            failed_line = handler.readline().strip('\n')
            while failed_line:
                try:
                    self.convert_operator_by_failed_line(failed_line)
                except (ValueError, OSError, AttributeError) as err:
                    self.hisi_utils.print_error_log(f'Failed to convert {failed_line} to Host format. \n {str(err)}')
                    failed_lines.append(failed_line)
                failed_line = handler.readline().strip('\n')
        if failed_lines:
            self.hisi_utils.print_error_log(f"Failed to convert: {failed_lines}")
        self.hisi_utils.print_info_log("Finish convert failed operators to host format.")
        return failed_lines
    def convert_operator_by_failed_line(self, failed_line):
        """Convert operator by failed line."""
        # Failed line format appears to be
        # `{op_file},{tensor_type}:{idx}[,{tensor_type}:{idx}...]` — inferred
        # from the parsing below; confirm against the HISI tool's output.
        fields = failed_line.split(',')
        if len(fields) > 1:
            op_file = fields[0]
            op_data = self.hisi_utils.parse_dump_file(op_file, self.args_parser.dump_version)
            missing_tensors = fields[1:]
            for missing_tensor in missing_tensors:
                # Each entry is like `input:0` or `output:1`.
                tensor_type, idx = missing_tensor.split(':')
                idx = int(idx)
                tensor = getattr(op_data, tensor_type)[idx]
                dump_data_array = self.get_tensor_numpy_value(tensor)
                self.save_tensor_file(op_file, tensor_type, idx, tensor, dump_data_array)
    def get_tensor_numpy_value(self, tensor):
        """Convert tensor from device format to host format."""
        dump_data_array = self.hisi_utils.deserialize_dump_data_to_array(tensor)
        array = dump_data_array.reshape(tensor.shape.dim)
        return array
    def save_tensor_file(self, op_file, tensor_type, idx, tensor, dump_data_array):
        """
        Save tensor file.

        Args:
            op_file (str): Source operator file path.
            tensor_type (str): The tensor type of the operator, `input` or `output`.
            idx (int): Tensor slot index.
            tensor (TensorProto): Tensor data in proto format.
            dump_data_array (numpy.array): Tensor data in numpy format.
        """
        op_name = os.path.basename(op_file)
        # shorten the op_name to meet the linux file name len limit.
        op_name = self._remove_scope_in_op_name(op_name)
        if self._is_npy_target():
            self._save_tensor_in_npy(op_name, tensor_type, idx, tensor, dump_data_array)
        else:
            self._save_tensor_in_bin(op_name, tensor_type, idx, tensor, dump_data_array)
    @staticmethod
    def _remove_scope_in_op_name(op_name):
        """Remove scope in operation name."""
        name_splits = op_name.split('.')
        node_name = name_splits[1]
        # Keep only the last `_`-separated segment of the scoped node name.
        name_splits[1] = node_name.split('_')[-1]
        return '.'.join(name_splits)
    def _save_tensor_in_npy(self, op_name, tensor_type, idx, tensor, dump_data_array):
        """
        Save tensor file in `npy` format.

        Args:
            op_name (str): Operator name without scope.
            tensor_type (str): The tensor type of the operator, `input` or `output`.
            idx (int): Tensor slot index.
            tensor (TensorProto): Tensor data in proto format.
            dump_data_array (numpy.array): Tensor data in numpy format.
        """
        out_file_name = "%s.%s.%d.%s.npy" % (
            op_name,
            tensor_type,
            idx,
            self.hisi_common.get_format_string(tensor.format)
        )
        out_path = os.path.join(self.args_parser.output_path, out_file_name)
        np.save(out_path, dump_data_array)
        # Generated files are made read-only.
        os.chmod(out_path, stat.S_IRUSR)
    def _save_tensor_in_bin(self, op_name, tensor_type, idx, tensor, dump_data_array):
        """
        Save tensor file in `bin` format.

        Args:
            op_name (str): Operator name without scope.
            tensor_type (str): The tensor type of the operator, `input` or `output`.
            idx (int): Tensor slot index.
            tensor (TensorProto): Tensor data in proto format.
            dump_data_array (numpy.array): Tensor data in numpy format.

        Returns:
            str, output tensor file name.
        """
        # Unlike npy, the bin file name also encodes the tensor shape (e.g. 2x3x4).
        out_file_name = "%s.%s.%d.%s.%s.bin" % (
            op_name,
            tensor_type,
            idx,
            self.hisi_utils.get_string_from_list(dump_data_array.shape, 'x'),
            self.hisi_common.get_format_string(tensor.format),
        )
        out_path = os.path.join(self.args_parser.output_path, out_file_name)
        dump_data_array.tofile(out_path)
        os.chmod(out_path, stat.S_IRUSR)
  271. class FileMapping:
  272. """Mapping op pattern to files."""
  273. def __init__(self, data_loader):
  274. self.data_loader = data_loader
  275. self.output_path = Path(data_loader.get_net_dir()).absolute()
  276. def find_tensor_file(self, pattern, device_ids=None, iterations=None):
  277. """
  278. Find tensor files.
  279. Args:
  280. pattern (str): File name pattern.
  281. device_ids (Union[None, list[int]]): Filter condition of device id. Default: None.
  282. iterations (Union[None, list[int]]): Filter condition of iteration id. Default: None.
  283. Returns:
  284. dict, file paths.
  285. """
  286. op_path = OpPathManager(pattern)
  287. if device_ids is None:
  288. device_dirs = self.output_path.glob('device_[0-9]*')
  289. else:
  290. device_dirs = []
  291. for device_id in device_ids:
  292. device_dirs.append(self.output_path.joinpath(f'device_{device_id}'))
  293. for device_dir in device_dirs:
  294. op_device_obj = self.find_tensor_file_per_device(pattern, device_dir, iterations)
  295. op_path.add(op_device_obj)
  296. return op_path
  297. def find_tensor_file_per_device(self, pattern, device_dir, iterations):
  298. """
  299. Find tensor files per device directory.
  300. Args:
  301. pattern (str): File name pattern.
  302. device_dir (Union[Path, str]): Directory path.
  303. iterations (Union[None, list[int]]): Filter condition of iteration id. Default: None.
  304. Returns:
  305. OpDevicePath, operator file path object of one device.
  306. """
  307. device_dir = Path(device_dir)
  308. # device_name is like `device_{device_id}`
  309. device_id = int(device_dir.name.split('_')[-1])
  310. op_device_obj = OpDevicePath(device_id)
  311. def _find_by_iter_dirs(dirs):
  312. for iter_dir in dirs:
  313. op_path_per_iter = self.find_tensor_file_per_iter(pattern, iter_dir)
  314. op_device_obj.add(op_path_per_iter)
  315. if iterations is None:
  316. iter_dirs = device_dir.glob('*_graph_[0-9]*/[0-9]*/[0-9]*')
  317. _find_by_iter_dirs(iter_dirs)
  318. else:
  319. for iteration in iterations:
  320. iter_dirs = device_dir.glob(f'*_graph_[0-9]*/[0-9]*/{iteration}')
  321. _find_by_iter_dirs(iter_dirs)
  322. return op_device_obj
  323. @staticmethod
  324. def find_tensor_file_per_iter(pattern, iter_dir):
  325. """
  326. Find tensor files per iteration directory.
  327. Args:
  328. pattern (str): File name pattern.
  329. iter_dir (Union[Path, str]): Iteration path.
  330. Returns:
  331. OpPath, the operator file path object of one iteration.
  332. """
  333. dir_path = Path(iter_dir)
  334. def _get_file_generator(tensor_type):
  335. return dir_path.glob(f'*{pattern}.*{tensor_type}.[0-9]*.npy')
  336. in_gen = _get_file_generator('input')
  337. out_gen = _get_file_generator('output')
  338. iteration = int(dir_path.name)
  339. op_path_obj = OpPath(iteration, in_gen, out_gen)
  340. return op_path_obj
  341. class OpPathManager:
  342. """The manager of tensor files of one operator."""
  343. def __init__(self, pattern, op_full_name=None):
  344. self.pattern = pattern
  345. self.op_full_name = op_full_name
  346. self._op_path = {}
  347. @property
  348. def devices(self):
  349. """Get list of iterations in cache."""
  350. return list(self._op_path.keys())
  351. def add(self, op_device_path):
  352. """Add OpDevicePath object."""
  353. self._op_path[op_device_path.device_id] = op_device_path
  354. def device(self, device_id):
  355. """Get OpDevicePath object according to device id."""
  356. return self._op_path.get(device_id)
  357. def to_dict(self):
  358. """Get operator files of all devices in dict format."""
  359. res = {}
  360. for device_id, op_path in self._op_path.items():
  361. key = f'device_{device_id}'
  362. res[key] = op_path.to_dict()
  363. return res
  364. class OpDevicePath:
  365. """The operator file object of specific device."""
  366. def __init__(self, device_id):
  367. self._device_id = device_id
  368. # record the operation path object of different iteration
  369. # the format is like <int, OpPath>
  370. self._op_path = {}
  371. @property
  372. def device_id(self):
  373. """The property of device id."""
  374. return self._device_id
  375. @property
  376. def iterations(self):
  377. """Get list of iterations in cache."""
  378. return list(self._op_path.keys())
  379. def iteration(self, iteration):
  380. """Get the op path object according to iteration."""
  381. return self._op_path.get(iteration)
  382. def add(self, op_path):
  383. """Add OpPath object."""
  384. self._op_path[op_path.iteration] = op_path
  385. def to_dict(self):
  386. """Get operator files of one device in dict format."""
  387. res = {}
  388. for iteration, op_path in self._op_path.items():
  389. res[iteration] = op_path.to_dict()
  390. return res
  391. class OpPath:
  392. """The operator file object of specific iteration."""
  393. def __init__(self, iteration, input_gen, output_gen):
  394. self._iter = iteration
  395. self._input_files = None
  396. self._input_gen = input_gen
  397. self._output_files = None
  398. self._output_gen = output_gen
  399. @staticmethod
  400. def _convert_path_gen_to_list(path_gen):
  401. """Convert generator of Path.glob to list of string."""
  402. return [str(path) for path in path_gen]
  403. @property
  404. def inputs(self):
  405. """The list of input tensor files."""
  406. if self._input_files is None:
  407. self._input_files = self._convert_path_gen_to_list(self._input_gen)
  408. return self._input_files
  409. @property
  410. def outputs(self):
  411. """The list of output tensor file paths."""
  412. if self._output_files is None:
  413. self._output_files = self._convert_path_gen_to_list(self._output_gen)
  414. return self._output_files
  415. @property
  416. def iteration(self):
  417. """The iteration of the tensor file."""
  418. return self._iter
  419. def to_dict(self):
  420. """Get operator files of one iteration in dict format."""
  421. res = {
  422. 'input': self.inputs,
  423. 'output': self.outputs
  424. }
  425. return res