You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

convert.py 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Parse tensor files from async dump structure."""
  16. import os
  17. import stat
  18. import sys
  19. from collections import namedtuple
  20. from importlib import import_module
  21. from pathlib import Path
  22. import numpy as np
  23. PARSE_ARGS_FIELDS = ['dump_path', 'format', 'output_path', 'output_file_type',
  24. 'input', 'output', 'shape',
  25. 'custom_script_path', 'dump_version']
  26. class ArgsParser(namedtuple("ArgsParser", PARSE_ARGS_FIELDS)):
  27. """Args Parser object."""
  28. __slots__ = ()
  29. def __new__(cls, **kwargs):
  30. new_kwargs = {field: kwargs.get(field) for field in PARSE_ARGS_FIELDS}
  31. new_kwargs['dump_version'] = kwargs.get('dump_version', '2.0')
  32. return super().__new__(cls, **new_kwargs)
  33. def load_hisi_tools(msaccucmp_path=None):
  34. """
  35. Load HISI tools.
  36. Args:
  37. msaccucmp_path (Path): The path object of msaccucmp.py path.
  38. Returns:
  39. tuple, the tuple of utils, common, shape_conversion module in toolkit package.
  40. """
  41. msaccucmp_path = get_msaccucmp_path(msaccucmp_path)
  42. hisi_tool_path = msaccucmp_path.parent
  43. if str(hisi_tool_path) not in sys.path:
  44. sys.path.append(str(hisi_tool_path))
  45. try:
  46. hisi_utils = import_module('utils')
  47. hisi_common = import_module('common')
  48. hisi_format_conversion = import_module('shape_conversion').FormatConversionMain
  49. except ModuleNotFoundError:
  50. raise ModuleNotFoundError(f'Failed to load HISI tools under {msaccucmp_path}')
  51. return hisi_utils, hisi_common, hisi_format_conversion
  52. def get_msaccucmp_path(msaccucmp_path=None):
  53. """
  54. Get the Path of HISI msaccucmp file.
  55. Args:
  56. msaccucmp_path (str): The path of `msaccucmp.py` or `msaccucmp.pyc`. Default: None.
  57. Returns:
  58. Path, the msaccucmp.py file path object.
  59. """
  60. if msaccucmp_path is not None:
  61. msaccucmp_path = Path(msaccucmp_path).resolve()
  62. if not msaccucmp_path.exists():
  63. raise FileNotFoundError(f"File {msaccucmp_path} doesn't exists. Please check the input value.")
  64. return msaccucmp_path
  65. # search msaccucmp file under $ASCEND_AICPU_PATH
  66. ascend_aicpu_path = os.environ.get('ASCEND_AICPU_PATH')
  67. if not ascend_aicpu_path:
  68. raise FileNotFoundError("Failed to find $ASCEND_AICPU_PATH parameter in environment. Please make sure you have"
  69. "install run packages and set the environment correctly.")
  70. ascend_aicpu_path = Path(ascend_aicpu_path).resolve()
  71. msaccucmp_files = list(ascend_aicpu_path.rglob('msaccucmp.py*'))
  72. if not msaccucmp_files:
  73. raise FileNotFoundError(f"Failed to find msaccucmp.py or msaccucmp.pyc file under {ascend_aicpu_path}. Please"
  74. f"make sure you have install toolkit package successfully.")
  75. return msaccucmp_files[0]
  76. class DumpRootDirConverter:
  77. """Convert the async dump data under dump root directory into host format."""
  78. def __init__(self, data_loader, msaccucmp_path=None):
  79. self.data_loader = data_loader
  80. self.dump_data_dir = Path(data_loader.get_net_dir())
  81. self.failed_summary_file = self.dump_data_dir.joinpath('convert_failed_files_summary.txt')
  82. self._hisi_tools = load_hisi_tools(msaccucmp_path)
  83. self.check_async_dir()
  84. def check_async_dir(self):
  85. """Check if this directory is dumped asynchronously on Ascend."""
  86. is_sync = self.data_loader.get_sync_flag()
  87. if is_sync:
  88. raise ValueError(f"The data under {str(self.dump_data_dir)} is not dumped asynchronously.")
  89. def convert(self):
  90. """Convert dump data under root dump data directory from device format to host format."""
  91. source_iterations = self.dump_data_dir.glob(f'device_[0-9]*/*_graph_[0-9]*/[0-9]*/[0-9]*/')
  92. failed_lines = []
  93. if self.failed_summary_file.is_file():
  94. self.failed_summary_file.unlink()
  95. for iter_path in source_iterations:
  96. dump_path = str(iter_path)
  97. res = DirConvert(dump_path=dump_path, output_path=dump_path, hisi_tools=self._hisi_tools).convert()
  98. failed_lines.extend(res)
  99. # add tensor format in file name
  100. if failed_lines:
  101. self.save_failed_fines(failed_lines)
  102. return failed_lines
  103. def save_failed_fines(self, failed_lines):
  104. """Save failed fines to file."""
  105. with self.failed_summary_file.open('w') as handler:
  106. for line in failed_lines:
  107. handler.write(line + '\n')
  108. self.failed_summary_file.chmod(stat.S_IRUSR)
  109. hisi_utils = self._hisi_tools[0]
  110. hisi_utils.print_info_log(f"Failed summary has saved to {str(self.failed_summary_file)}")
  111. class DirConvert:
  112. """Convert the async dump data under one directory into host format."""
  113. def __init__(self, dump_path, output_path, target_format='NCHW', output_file_type='npy', hisi_tools=None):
  114. self.args_parser = ArgsParser(dump_path=dump_path,
  115. format=target_format,
  116. output_path=output_path,
  117. output_file_type=output_file_type)
  118. self.output_path = Path(output_path).absolute()
  119. self.failed_file_path = self.output_path.joinpath('convert_failed_file_list.txt')
  120. self.hisi_utils, self.hisi_common, self.hisi_format_conversion = load_hisi_tools() \
  121. if hisi_tools is None else hisi_tools
  122. def _is_npy_target(self):
  123. """Check if the output_file type is npy."""
  124. return self.args_parser.output_file_type == 'npy'
  125. def clean_old_files(self):
  126. """Clean old files."""
  127. # clean failed file record
  128. if self.failed_file_path.is_file():
  129. self.failed_file_path.unlink()
  130. # clean old converted data.
  131. old_data_files = self.output_path.glob(f'*.{self.args_parser.output_file_type}')
  132. for file in old_data_files:
  133. file.unlink()
  134. def convert(self):
  135. """Convert async dump data of src_dir to target_format and saved in output_dir."""
  136. conversion = self.hisi_format_conversion(self.args_parser)
  137. self.clean_old_files()
  138. failed_lines = []
  139. ret = conversion.convert_format()
  140. self.rename_generated_npy_file()
  141. if ret != self.hisi_utils.VECTOR_COMPARISON_NONE_ERROR:
  142. self.hisi_utils.print_info_log(
  143. f"Begin to convert failed operator in {str(self.failed_file_path)} one by one.")
  144. failed_lines = self.convert_failed_tensors()
  145. else:
  146. self.hisi_utils.print_info_log(
  147. f"All tensor under {self.args_parser.dump_path} have been converted to {self.output_path} "
  148. f"successfully.")
  149. return failed_lines
  150. def rename_generated_npy_file(self):
  151. """Rename the npy file generated by HISI tool to MS file name format."""
  152. # before change, the file name is format like:
  153. # {op_type}.{op_name_with_scope}.{task_id}(.stream_id).{timestamp}.{tensor_type}.{slot}.{shape}.npy
  154. # after change, the file name is format like:
  155. # {op_type}.{op_name}.{task_id}(.stream_id).{timestamp}.{tensor_type}.{slot}.{format}.npy
  156. if not self._is_npy_target():
  157. return
  158. self.hisi_utils.print_info_log(
  159. f"Start to rename npy files under {self.output_path}")
  160. target_format = self.args_parser.format
  161. old_data_files = self.output_path.glob('*.npy')
  162. for file in old_data_files:
  163. name_splits = file.name.split('.')
  164. name_splits[1] = name_splits[1].split('_')[-1]
  165. name_splits[-2] = target_format
  166. new_file_name = '.'.join(name_splits)
  167. file.rename(file.with_name(new_file_name))
  168. def convert_failed_tensors(self):
  169. """Convert failed tensors from failed txt."""
  170. failed_lines = []
  171. if not self.failed_file_path.is_file():
  172. return failed_lines
  173. with self.failed_file_path.open() as handler:
  174. failed_line = handler.readline().strip('\n')
  175. while failed_line:
  176. try:
  177. self.convert_operator_by_failed_line(failed_line)
  178. except (ValueError, OSError, AttributeError) as err:
  179. self.hisi_utils.print_error_log(f'Failed to convert {failed_line} to Host format. \n {str(err)}')
  180. failed_lines.append(failed_line)
  181. failed_line = handler.readline().strip('\n')
  182. if failed_lines:
  183. self.hisi_utils.print_error_log(f"Failed to convert: {failed_lines}")
  184. self.hisi_utils.print_info_log("Finish convert failed operators to host format.")
  185. return failed_lines
  186. def convert_operator_by_failed_line(self, failed_line):
  187. """Convert operator by failed line."""
  188. fields = failed_line.split(',')
  189. if len(fields) > 1:
  190. op_file = fields[0]
  191. op_data = self.hisi_utils.parse_dump_file(op_file, self.args_parser.dump_version)
  192. missing_tensors = fields[1:]
  193. for missing_tensor in missing_tensors:
  194. tensor_type, idx = missing_tensor.split(':')
  195. idx = int(idx)
  196. tensor = op_data.input[idx] if tensor_type == 'input' else op_data.output[idx]
  197. dump_data_array = self.get_tensor_numpy_value(tensor)
  198. self.save_tensor_file(op_file, tensor_type, idx, tensor, dump_data_array)
  199. def get_tensor_numpy_value(self, tensor):
  200. """Convert tensor from device format to host format."""
  201. dump_data_array = self.hisi_utils.deserialize_dump_data_to_array(tensor)
  202. array = dump_data_array.reshape(tensor.shape.dim)
  203. return array
  204. def save_tensor_file(self, op_file, tensor_type, idx, tensor, dump_data_array):
  205. """
  206. Save tensor file.
  207. Args:
  208. op_file (str): Source operator file path.
  209. tensor_type (str): The tensor type of the operator, `input` or `output`.
  210. idx (int): Tensor slot index.
  211. tensor (TensorProto): Tensor data in proto format.
  212. dump_data_array (numpy.array): Tensor data in numpy format.
  213. """
  214. op_name = os.path.basename(op_file)
  215. # shorten the op_name to meet the linux file name len limit.
  216. op_name = self._remove_scope_in_op_name(op_name)
  217. if self._is_npy_target():
  218. self._save_tensor_in_npy(op_name, tensor_type, idx, tensor, dump_data_array)
  219. else:
  220. self._save_tensor_in_bin(op_name, tensor_type, idx, tensor, dump_data_array)
  221. @staticmethod
  222. def _remove_scope_in_op_name(op_name):
  223. """Remove scope in operation name."""
  224. name_splits = op_name.split('.')
  225. node_name = name_splits[1]
  226. name_splits[1] = node_name.split('_')[-1]
  227. return '.'.join(name_splits)
  228. def _save_tensor_in_npy(self, op_name, tensor_type, idx, tensor, dump_data_array):
  229. """
  230. Save tensor file in `npy` format.
  231. Args:
  232. op_name (str): Operator name without scope.
  233. tensor_type (str): The tensor type of the operator, `input` or `output`.
  234. idx (int): Tensor slot index.
  235. tensor (TensorProto): Tensor data in proto format.
  236. dump_data_array (numpy.array): Tensor data in numpy format.
  237. """
  238. out_file_name = "%s.%s.%d.%s.npy" % (
  239. op_name,
  240. tensor_type,
  241. idx,
  242. self.hisi_common.get_format_string(tensor.format)
  243. )
  244. out_path = os.path.join(self.args_parser.output_path, out_file_name)
  245. np.save(out_path, dump_data_array)
  246. def _save_tensor_in_bin(self, op_name, tensor_type, idx, tensor, dump_data_array):
  247. """
  248. Save tensor file in `bin` format.
  249. Args:
  250. op_name (str): Operator name without scope.
  251. tensor_type (str): The tensor type of the operator, `input` or `output`.
  252. idx (int): Tensor slot index.
  253. tensor (TensorProto): Tensor data in proto format.
  254. dump_data_array (numpy.array): Tensor data in numpy format.
  255. Returns:
  256. str, output tensor file name.
  257. """
  258. out_file_name = "%s.%s.%d.%s.%s.bin" % (
  259. op_name,
  260. tensor_type,
  261. idx,
  262. self.hisi_utils.get_string_from_list(dump_data_array.shape, 'x'),
  263. self.hisi_common.get_format_string(tensor.format),
  264. )
  265. out_path = os.path.join(self.args_parser.output_path, out_file_name)
  266. dump_data_array.tofile(out_path)
  267. class FileMapping:
  268. """Mapping op pattern to files."""
  269. def __init__(self, data_loader):
  270. self.data_loader = data_loader
  271. self.output_path = Path(data_loader.get_net_dir()).absolute()
  272. def find_tensor_file(self, pattern, device_ids=None, iterations=None):
  273. """
  274. Find tensor files.
  275. Args:
  276. pattern (str): File name pattern.
  277. device_ids (Union[None, list[int]]): Filter condition of device id. Default: None.
  278. iterations (Union[None, list[int]]): Filter condition of iteration id. Default: None.
  279. Returns:
  280. dict, file paths.
  281. """
  282. op_path = OpPathManager(pattern)
  283. if device_ids is None:
  284. device_dirs = self.output_path.glob('device_[0-9]*')
  285. else:
  286. device_dirs = []
  287. for device_id in device_ids:
  288. device_dirs.append(self.output_path.joinpath(f'device_{device_id}'))
  289. for device_dir in device_dirs:
  290. op_device_obj = self.find_tensor_file_per_device(pattern, device_dir, iterations)
  291. op_path.add(op_device_obj)
  292. return op_path
  293. def find_tensor_file_per_device(self, pattern, device_dir, iterations):
  294. """
  295. Find tensor files per device directory.
  296. Args:
  297. pattern (str): File name pattern.
  298. device_dir (Union[Path, str]): Directory path.
  299. iterations (Union[None, list[int]]): Filter condition of iteration id. Default: None.
  300. Returns:
  301. OpDevicePath, operator file path object of one device.
  302. """
  303. device_dir = Path(device_dir)
  304. # device_name is like `device_{device_id}`
  305. device_id = int(device_dir.name.split('_')[-1])
  306. op_device_obj = OpDevicePath(device_id)
  307. def _find_by_iter_dirs(dirs):
  308. for iter_dir in dirs:
  309. op_path_per_iter = self.find_tensor_file_per_iter(pattern, iter_dir)
  310. op_device_obj.add(op_path_per_iter)
  311. if iterations is None:
  312. iter_dirs = device_dir.glob('*_graph_[0-9]*/[0-9]*/[0-9]*')
  313. _find_by_iter_dirs(iter_dirs)
  314. else:
  315. for iteration in iterations:
  316. iter_dirs = device_dir.glob(f'*_graph_[0-9]*/[0-9]*/{iteration}')
  317. _find_by_iter_dirs(iter_dirs)
  318. return op_device_obj
  319. @staticmethod
  320. def find_tensor_file_per_iter(pattern, iter_dir):
  321. """
  322. Find tensor files per iteration directory.
  323. Args:
  324. pattern (str): File name pattern.
  325. iter_dir (Union[Path, str]): Iteration path.
  326. Returns:
  327. OpPath, the operator file path object of one iteration.
  328. """
  329. dir_path = Path(iter_dir)
  330. def _get_file_generator(tensor_type):
  331. return dir_path.glob(f'*{pattern}.*{tensor_type}.[0-9]*.npy')
  332. in_gen = _get_file_generator('input')
  333. out_gen = _get_file_generator('output')
  334. iteration = int(dir_path.name)
  335. op_path_obj = OpPath(iteration, in_gen, out_gen)
  336. return op_path_obj
  337. class OpPathManager:
  338. """The manager of tensor files of one operator."""
  339. def __init__(self, pattern, op_full_name=None):
  340. self.pattern = pattern
  341. self.op_full_name = op_full_name
  342. self._op_path = {}
  343. @property
  344. def devices(self):
  345. """Get list of iterations in cache."""
  346. return list(self._op_path.keys())
  347. def add(self, op_device_path):
  348. """Add OpDevicePath object."""
  349. self._op_path[op_device_path.device_id] = op_device_path
  350. def device(self, device_id):
  351. """Get OpDevicePath object according to device id."""
  352. return self._op_path.get(device_id)
  353. def to_dict(self):
  354. """Get operator files of all devices in dict format."""
  355. res = {}
  356. for device_id, op_path in self._op_path.items():
  357. key = f'device_{device_id}'
  358. res[key] = op_path.to_dict()
  359. return res
  360. class OpDevicePath:
  361. """The operator file object of specific device."""
  362. def __init__(self, device_id):
  363. self._device_id = device_id
  364. # record the operation path object of different iteration
  365. # the format is like <int, OpPath>
  366. self._op_path = {}
  367. @property
  368. def device_id(self):
  369. """The property of device id."""
  370. return self._device_id
  371. @property
  372. def iterations(self):
  373. """Get list of iterations in cache."""
  374. return list(self._op_path.keys())
  375. def iteration(self, iteration):
  376. """Get the op path object according to iteration."""
  377. return self._op_path.get(iteration)
  378. def add(self, op_path):
  379. """Add OpPath object."""
  380. self._op_path[op_path.iteration] = op_path
  381. def to_dict(self):
  382. """Get operator files of one device in dict format."""
  383. res = {}
  384. for iteration, op_path in self._op_path.items():
  385. res[iteration] = op_path.to_dict()
  386. return res
  387. class OpPath:
  388. """The operator file object of specific iteration."""
  389. def __init__(self, iteration, input_gen, output_gen):
  390. self._iter = iteration
  391. self._input_files = None
  392. self._input_gen = input_gen
  393. self._output_files = None
  394. self._output_gen = output_gen
  395. @staticmethod
  396. def _convert_path_gen_to_list(path_gen):
  397. """Convert generator of Path.glob to list of string."""
  398. return [str(path) for path in path_gen]
  399. @property
  400. def input(self):
  401. """The list of input tensor files."""
  402. if self._input_files is None:
  403. self._input_files = self._convert_path_gen_to_list(self._input_gen)
  404. return self._input_files
  405. @property
  406. def output(self):
  407. """The list of output tensor file paths."""
  408. if self._output_files is None:
  409. self._output_files = self._convert_path_gen_to_list(self._output_gen)
  410. return self._output_files
  411. @property
  412. def iteration(self):
  413. """The iteration of the tensor file."""
  414. return self._iter
  415. def to_dict(self):
  416. """Get operator files of one iteration in dict format."""
  417. res = {
  418. 'input': self.input,
  419. 'output': self.output
  420. }
  421. return res