| |
| import logging |
| import os |
| import os.path as osp |
| import sys |
| from typing import Callable, Optional, Union |
|
|
| import torch |
|
|
| from mmengine.dist import master_only |
| from mmengine.hooks import Hook |
| from mmengine.logging import print_log |
| from mmengine.registry import HOOKS |
|
|
|
|
def check_kineto() -> bool:
    """Return ``True`` if the installed PyTorch exposes Kineto support.

    Older versions of torch lack ``torch.autograd.kineto_available``; in
    that case a warning is logged and ``False`` is returned.
    """
    try:
        available = torch.autograd.kineto_available()
    except AttributeError:
        # torch too old to know about Kineto at all.
        print_log('NO KINETO', logger='current', level=logging.WARNING)
        return False
    return True if available else False
|
|
|
|
@HOOKS.register_module()
class ProfilerHook(Hook):
    """A hook to analyze performance during training and inference.

    PyTorch Profiler is a tool that allows the collection of the performance
    metrics during the training. More details on Profiler can be found at
    `official docs <https://pytorch.org/docs/stable/profiler.html
    #torch.profiler.profile>`_

    Args:
        by_epoch (bool): Profile performance by epoch or by iteration.
            Defaults to True.
        profile_times (int): The period (epoch/iter) recorded by the profiler.
            Defaults to 1. For example, profile_times=10 and by_epoch=False,
            indicate that 0-10 iterations are recorded.
        activity_with_cpu (bool): Activities to be used in the analysis (CPU)
        activity_with_cuda (bool): Activities to be used in the analysis (CUDA)
        schedule (dict, optional): Key-word arguments passed to
            `torch.profile.schedule <https://pytorch.org/docs/stable/
            profiler.html#torch.profiler.schedule>`_.
            Defaults to None, which means profiling without a schedule
        on_trace_ready (callable, dict, optional): Either a handler or a dict
            of generating handler. Defaults to None, which means profiling
            without an on_trace_ready. The Callable type needs to construct
            its own function that can handle
            'torch.autograd.profiler.profile'.
            Two officially recommended ways are provided:

            - ``on_trace_ready=dict(type='log_trace')``: Print the profiling
              result in the terminal. See more details in the
              `PyTorch official tutorial`_. The configurable arguments are
              the same as ``prof.key_averages().table``
            - ``on_trace_ready=dict(type='tb_trace')``: Profile the
              performance with tensorboard. See more details in the tutorial
              `profile with tensorboard`_.

        record_shapes (bool): Save information about operator's input shapes.
            Defaults to False.
        profile_memory (bool): Track tensor memory allocation/deallocation.
            Defaults to False.
        with_stack (bool): Record source information (file and line number)
            for the ops. Defaults to False.
        with_flops (bool): Use formula to estimate the FLOPS of specific
            operators (matrix multiplication and 2D convolution).
            Defaults to False.
        json_trace_path (str, optional): Exports the collected trace in Chrome
            JSON format. Chrome use 'chrome://tracing' view json file.
            Defaults to None, which means profiling does not store json files.

    Warnings:
        The profiler will be closed after ``profile_times`` iterations
        automatically. Please make sure the configuration of your scheduler
        will not close the profiler before the iteration reaches the value of
        ``profile_times``

    Examples:
        >>> # tensorboard trace
        >>> trace_config = dict(type='tb_trace')
        >>> profiler_hook_cfg = dict(on_trace_ready=trace_config)

    .. _PyTorch official tutorial: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#using-profiler-to-analyze-execution-time
    .. _profile with tensorboard: https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#pytorch-profiler-with-tensorboard
    """
    priority = 'VERY_LOW'

    def __init__(self,
                 *,
                 by_epoch: bool = True,
                 profile_times: int = 1,
                 activity_with_cpu: bool = True,
                 activity_with_cuda: bool = False,
                 schedule: Optional[dict] = None,
                 on_trace_ready: Union[Callable, dict, None] = None,
                 record_shapes: bool = False,
                 profile_memory: bool = False,
                 with_stack: bool = False,
                 with_flops: bool = False,
                 json_trace_path: Optional[str] = None) -> None:
        # ``torch.profiler`` and Kineto both landed around torch 1.8.1;
        # fail fast with a clear message on older installations.
        try:
            from torch import profiler
        except ImportError:
            raise ImportError('please upgrade torch above 1.8.1')
        if not check_kineto():
            raise ImportError('Due to Kineto support issues, please upgrade '
                              'pytorch above 1.8.1 (windows users above '
                              '1.9.1)')

        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
        self.by_epoch = by_epoch

        if profile_times < 1:
            raise ValueError('profile_times should be greater than 0, '
                             f'but got {profile_times}')
        if by_epoch and profile_times > 1:
            # Profiling slows training down considerably, so epoch-wise
            # profiling is restricted to a single epoch.
            raise ValueError(
                f'Profiler will profile 0-{profile_times} epochs.\n'
                'Since profiler will slow down the training, it is recommended'
                ' to train 1 epoch with ProfilerHook and adjust your setting '
                'according to the profiler summary.\n'
                'During normal training(epoch > 1), '
                'you may disable the ProfilerHook.')
        self.profile_times = profile_times

        assert isinstance(activity_with_cpu, bool), \
            '``activity_with_cpu`` should be a boolean.'
        assert isinstance(activity_with_cuda, bool), \
            '``activity_with_cuda`` should be a boolean.'
        self.activities = []
        if activity_with_cpu:
            self.activities.append(profiler.ProfilerActivity.CPU)
        if activity_with_cuda:
            self.activities.append(profiler.ProfilerActivity.CUDA)

        if schedule is not None:
            assert isinstance(schedule, dict), '``schedule`` should be a dict.'
            self.schedule = profiler.schedule(**schedule)
        else:
            self.schedule = None

        self.on_trace_ready = on_trace_ready
        self.record_shapes = record_shapes
        self.profile_memory = profile_memory
        self.with_stack = with_stack
        self.with_flops = with_flops

        self.json_trace_path = json_trace_path
        # Set to True once the profiler has been exited; guards against
        # double-closing in the epoch/iter hooks below.
        self._closed = False

    def before_run(self, runner):
        """Initialize the profiler.

        Through the runner parameter, the validity of the parameter is further
        determined.
        """
        max_times = runner.max_epochs if self.by_epoch else runner.max_iters
        if max_times < self.profile_times:
            raise ValueError(
                f'``profile_times`` should not be greater than {max_times}')

        on_trace_ready = self._parse_trace_config(runner)

        self.profiler = torch.profiler.profile(
            activities=self.activities,
            schedule=self.schedule,
            on_trace_ready=on_trace_ready,
            record_shapes=self.record_shapes,
            profile_memory=self.profile_memory,
            with_stack=self.with_stack,
            with_flops=self.with_flops)

        # Enter the context manager manually; it is exited in
        # ``_export_chrome_trace`` once profiling is complete.
        self.profiler.__enter__()
        runner.logger.info('profiler is profiling...')

    def _parse_trace_config(self, runner):
        """Used to parse the parameter 'on_trace_ready'.

        Returns a callable handler (or None) suitable for
        ``torch.profiler.profile(on_trace_ready=...)``.
        """
        if self.on_trace_ready is None:
            _on_trace_ready = None
        elif callable(self.on_trace_ready):
            _on_trace_ready = self.on_trace_ready
        elif isinstance(self.on_trace_ready, dict):
            trace_cfg = self.on_trace_ready.copy()
            trace_type = trace_cfg.pop('type')

            if trace_type == 'log_trace':
                # Print a summary table to stdout; remaining keys of
                # ``trace_cfg`` are forwarded to ``key_averages().table``.
                def _log_handler(_profile):
                    print(_profile.key_averages().table(**trace_cfg))

                _on_trace_ready = _log_handler

            elif trace_type == 'tb_trace':
                # Only checks availability; the handler below does the work.
                try:
                    import torch_tb_profiler  # noqa: F401
                except ImportError:
                    raise ImportError(
                        'please run ``pip install torch-tb-profiler``')

                # Resolve a relative (or missing) ``dir_name`` against the
                # runner's log directory.
                if 'dir_name' not in trace_cfg:
                    trace_cfg['dir_name'] = osp.join(runner.log_dir,
                                                    'tf_tracing_logs')
                elif not osp.isabs(trace_cfg['dir_name']):
                    trace_cfg['dir_name'] = osp.join(runner.log_dir,
                                                    trace_cfg['dir_name'])
                runner.logger.info('trace_files of ProfilerHook will be '
                                   f'saved to {trace_cfg["dir_name"]}.')

                if self.json_trace_path is not None:
                    # ``Logger.warn`` is deprecated; use ``warning``.
                    runner.logger.warning(
                        'When using tensorboard_trace, it is recommended to '
                        'save json files by setting ``worker_name`` instead of'
                        ' setting ``json_trace_path``')
                _on_trace_ready = torch.profiler.tensorboard_trace_handler(
                    **trace_cfg)
            else:
                raise ValueError('trace_type should be "log_trace" or '
                                 f'"tb_trace", but got {trace_type}')
        else:
            raise ValueError(
                '``on_trace_ready`` should be a handler, or dict, or None, '
                f'but got {self.on_trace_ready}')
        return _on_trace_ready

    def after_train_epoch(self, runner):
        """Determine if the content is exported."""
        # Only close at epoch boundaries when profiling by epoch; otherwise
        # an iteration-based profile spanning several epochs would be cut
        # short at the end of the first epoch.
        if self.by_epoch and not self._closed:
            self._export_chrome_trace(runner)

    def after_train_iter(self, runner, batch_idx, data_batch, outputs):
        """profiler will call `step` method if it is not closed."""
        if not self._closed:
            self.profiler.step()
        if runner.iter == self.profile_times - 1 and not self.by_epoch:
            self._export_chrome_trace(runner)

    def _export_chrome_trace(self, runner):
        """Exporting content."""
        self._closed = True
        runner.logger.info('profiler may take a few minutes...')
        self.profiler.__exit__(None, None, None)
        if self.json_trace_path is not None:
            self.profiler.export_chrome_trace(self.json_trace_path)
|
|
|
|
@HOOKS.register_module()
class NPUProfilerHook(Hook):
    """NPUProfiler to analyze performance during training.

    NPU Profiling is used to count the device execution time of all operators.
    The torch_npu.npu.profile interface is used to complete the profiling data
    collection at each stage of the project, and the data is analyzed by the
    msprof tool and the data can be dumped to further manually analyze the
    key performance bottlenecks. For more details on the torch_npu.npu.profile
    interface, please visit
    https://gitee.com/ascend/pytorch/blob/master/torch_npu/npu/profiler.py#profile

    Args:
        begin (int): Number of start iterations for profiling. Defaults to 0.
        end (int): Number of end iterations for profiling. Defaults to 1.
        result_path (str): The path to save the profiling results file.
            Defaults to 'cann_profiling'.
        exit_after_profiling (bool): Whether to exit the program after
            profiling. Defaults to True.
        use_e2e_profiler (bool): Turn on E2E profiling, E2E profiling combines
            performance data at the Pytorch level and the NPU level to analyze
            the bottlenecks of model performance end-to-end, and cannot show
            detailed content, and only as an auxiliary analysis.
            Defaults to False.
        ge_profiling_to_std_out (bool): Turn on GE profiling, GE uses to
            collect the profiling data of the host side scheduling of the
            Ascend device. Defaults to False.

    Examples:
        >>> cfg = ...
        >>> profiler_config = dict(type='NPUProfilerHook', end=2)
        >>> cfg.merge_from_dict({'custom_hooks': [profiler_config]})
        >>> runner = Runner.from_cfg(cfg)
        >>> runner.train()
    """
    priority = 'VERY_LOW'

    def __init__(self,
                 *,
                 begin: int = 0,
                 end: int = 1,
                 result_path: str = 'cann_profiling',
                 exit_after_profiling: bool = True,
                 use_e2e_profiler: bool = False,
                 ge_profiling_to_std_out: bool = False):
        # torch_npu provides the profiling context manager; it is only
        # available on Ascend NPU installations.
        try:
            import torch_npu
        except ImportError:
            raise ImportError('Failed to import torch_npu module')

        if begin >= end:
            raise ValueError(
                'The iteration to start profiling should not be greater '
                'than or equal to profile end')

        self.begin = begin
        self.end = end
        self.result_path = result_path
        self.exit_after_profiling = exit_after_profiling

        # GE profiling is toggled through an environment variable read by
        # the Ascend runtime.
        if ge_profiling_to_std_out:
            os.environ['GE_PROFILING_TO_STD_OUT'] = '1'

        if not osp.exists(self.result_path):
            os.makedirs(self.result_path, exist_ok=True)

        self.profiler = torch_npu.npu.profile(
            self.result_path, use_e2e_profiler=use_e2e_profiler)

    @master_only
    def before_run(self, runner):
        # Validate the profiling window against the training schedule.
        if self.end > runner.max_iters:
            raise ValueError(
                'The profiling end iteration should not be greater '
                'than the max iteration')

    @master_only
    def before_train_iter(self, runner, batch_idx, data_batch=None):
        # Enter the profiling context at the configured start iteration.
        if runner.iter == self.begin:
            self.profiler.__enter__()
            runner.logger.info('NPUProfiler starts profiling...')

    @master_only
    def after_train_iter(self,
                         runner,
                         batch_idx,
                         data_batch=None,
                         outputs=None):
        # Leave the profiling context at the configured end iteration and
        # optionally stop the whole program (profiling-only runs).
        if runner.iter == self.end - 1:
            runner.logger.info('profiler may take a few minutes to'
                               ' save the profiling result.')
            self.profiler.__exit__(None, None, None)
            if self.exit_after_profiling:
                sys.exit()
|
|