| | |
| | import logging |
| | import os |
| | import os.path as osp |
| | from collections import OrderedDict |
| | from pathlib import Path |
| | from typing import Dict, Optional, Sequence, Union |
| |
|
| | import numpy as np |
| | import torch |
| |
|
| | from mmengine.fileio import FileClient, dump |
| | from mmengine.fileio.io import get_file_backend |
| | from mmengine.hooks import Hook |
| | from mmengine.logging import print_log |
| | from mmengine.registry import HOOKS |
| | from mmengine.utils import is_seq_of, scandir |
| |
|
| | DATA_BATCH = Optional[Union[dict, tuple, list]] |
| | SUFFIX_TYPE = Union[Sequence[str], str] |
| |
|
| |
|
@HOOKS.register_module()
class LoggerHook(Hook):
    """Collect logs from different components of ``Runner`` and write them to
    terminal, JSON file, tensorboard and wandb .etc.

    ``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during
    training/validation/testing phase. It is used to control following
    behaviors:

    - The frequency of logs update in terminal, local, tensorboad wandb.etc.
    - The frequency of show experiment information in terminal.
    - The work directory to save logs.

    Args:
        interval (int): Logging interval (every k iterations).
            Defaults to 10.
        ignore_last (bool): Ignore the log of last iterations in each epoch if
            the number of remaining iterations is less than :attr:`interval`.
            Defaults to True.
        interval_exp_name (int): Logging interval for experiment name. This
            feature is to help users conveniently get the experiment
            information from screen or log file. Defaults to 1000.
        out_dir (str or Path, optional): The root directory to save
            checkpoints. If not specified, ``runner.work_dir`` will be used
            by default. If specified, the ``out_dir`` will be the concatenation
            of ``out_dir`` and the last level directory of ``runner.work_dir``.
            For example, if the input ``out_dir`` is ``./tmp`` and
            ``runner.work_dir`` is ``./work_dir/cur_exp``, then the log will be
            saved in ``./tmp/cur_exp``. Defaults to None.
        out_suffix (Tuple[str] or str): Those files in ``runner._log_dir``
            ending with ``out_suffix`` will be copied to ``out_dir``. Defaults
            to ('.json', '.log', '.py', 'yaml').
        keep_local (bool): Whether to keep local logs in the local machine
            when :attr:`out_dir` is specified. If False, the local log will be
            removed. Defaults to True.
        file_client_args (dict, optional): Arguments to instantiate a
            FileClient. See :class:`mmengine.fileio.FileClient` for details.
            Defaults to None. It will be deprecated in future. Please use
            `backend_args` instead.
        log_metric_by_epoch (bool): Whether to output metric in validation step
            by epoch. It can be true when running in epoch based runner.
            If set to True, `after_val_epoch` will set `step` to self.epoch in
            `runner.visualizer.add_scalars`. Otherwise `step` will be
            self.iter. Defaults to True.
        backend_args (dict, optional): Arguments to instantiate the
            prefix of uri corresponding backend. Defaults to None.
            New in v0.2.0.

    Examples:
        >>> # The simplest LoggerHook config.
        >>> logger_hook_cfg = dict(interval=20)
    """
    priority = 'BELOW_NORMAL'

    def __init__(self,
                 interval: int = 10,
                 ignore_last: bool = True,
                 interval_exp_name: int = 1000,
                 out_dir: Optional[Union[str, Path]] = None,
                 out_suffix: SUFFIX_TYPE = ('.json', '.log', '.py', 'yaml'),
                 keep_local: bool = True,
                 file_client_args: Optional[dict] = None,
                 log_metric_by_epoch: bool = True,
                 backend_args: Optional[dict] = None):
        # Validate every argument eagerly so that a misconfigured hook fails
        # at construction time instead of midway through training.
        if not isinstance(interval, int):
            raise TypeError('interval must be an integer')
        if interval <= 0:
            raise ValueError('interval must be greater than 0')

        if not isinstance(ignore_last, bool):
            raise TypeError('ignore_last must be a boolean')

        if not isinstance(interval_exp_name, int):
            raise TypeError('interval_exp_name must be an integer')
        if interval_exp_name <= 0:
            raise ValueError('interval_exp_name must be greater than 0')

        if out_dir is not None and not isinstance(out_dir, (str, Path)):
            raise TypeError('out_dir must be a str or Path object')

        if not isinstance(keep_local, bool):
            raise TypeError('keep_local must be a boolean')

        # ``file_client_args`` only makes sense when logs are exported
        # somewhere, i.e. when ``out_dir`` is given.
        if out_dir is None and file_client_args is not None:
            raise ValueError(
                'file_client_args should be "None" when `out_dir` is not '
                'specified.')

        if file_client_args is not None:
            print_log(
                '"file_client_args" will be deprecated in future. '
                'Please use "backend_args" instead',
                logger='current',
                level=logging.WARNING)
            if backend_args is not None:
                raise ValueError(
                    '"file_client_args" and "backend_args" cannot be set '
                    'at the same time.')

        if not (isinstance(out_suffix, str) or is_seq_of(out_suffix, str)):
            raise TypeError('out_suffix should be a string or a sequence of '
                            f'string, but got {type(out_suffix)}')

        self.out_suffix = out_suffix
        self.out_dir = out_dir
        self.interval = interval
        self.ignore_last = ignore_last
        self.interval_exp_name = interval_exp_name
        self.keep_local = keep_local
        self.file_client_args = file_client_args
        # Filled in by ``before_run`` once ``runner.timestamp`` is known.
        self.json_log_path: Optional[str] = None

        if self.out_dir is not None:
            # ``file_client`` is kept for backward compatibility; all I/O in
            # this hook goes through ``file_backend``.
            self.file_client = FileClient.infer_client(file_client_args,
                                                       self.out_dir)
            if file_client_args is None:
                self.file_backend = get_file_backend(
                    self.out_dir, backend_args=backend_args)
            else:
                self.file_backend = self.file_client

        self.log_metric_by_epoch = log_metric_by_epoch

    def before_run(self, runner) -> None:
        """Infer ``self.file_client`` from ``self.out_dir``. Initialize the
        ``self.start_iter`` and record the meta information.

        Args:
            runner (Runner): The runner of the training process.
        """
        if self.out_dir is not None:
            # The final `self.out_dir` is the concatenation of `self.out_dir`
            # and the last level directory of `runner.work_dir`.
            basename = osp.basename(runner.work_dir.rstrip(osp.sep))
            self.out_dir = self.file_backend.join_path(self.out_dir, basename)
            runner.logger.info(
                f'Text logs will be saved to {self.out_dir} after the '
                'training process.')

        self.json_log_path = f'{runner.timestamp}.json'

    def after_train_iter(self,
                         runner,
                         batch_idx: int,
                         data_batch: DATA_BATCH = None,
                         outputs: Optional[dict] = None) -> None:
        """Record logs after training iteration.

        Args:
            runner (Runner): The runner of the training process.
            batch_idx (int): The index of the current batch in the train loop.
            data_batch (dict tuple or list, optional): Data from dataloader.
            outputs (dict, optional): Outputs from model.
        """
        # Print experiment name every n iterations.
        if self.every_n_train_iters(
                runner, self.interval_exp_name) or (self.end_of_epoch(
                    runner.train_dataloader, batch_idx)):
            exp_info = f'Exp name: {runner.experiment_name}'
            runner.logger.info(exp_info)
        if self.every_n_inner_iters(batch_idx, self.interval):
            tag, log_str = runner.log_processor.get_log_after_iter(
                runner, batch_idx, 'train')
        elif (self.end_of_epoch(runner.train_dataloader, batch_idx)
              and (not self.ignore_last
                   or len(runner.train_dataloader) <= self.interval)):
            # `runner.max_iters` may not be divisible by `self.interval`. if
            # `self.ignore_last==True`, the log of remaining iterations will
            # be recorded (Epoch [4][1000/1007], the logs of 998-1007
            # iterations will be recorded).
            tag, log_str = runner.log_processor.get_log_after_iter(
                runner, batch_idx, 'train')
        else:
            return
        runner.logger.info(log_str)
        runner.visualizer.add_scalars(
            tag, step=runner.iter + 1, file_path=self.json_log_path)

    def after_val_iter(self,
                       runner,
                       batch_idx: int,
                       data_batch: DATA_BATCH = None,
                       outputs: Optional[Sequence] = None) -> None:
        """Record logs after validation iteration.

        Args:
            runner (Runner): The runner of the validation process.
            batch_idx (int): The index of the current batch in the validation
                loop.
            data_batch (dict or tuple or list, optional): Data from dataloader.
                Defaults to None.
            outputs (sequence, optional): Outputs from model.
        """
        if self.every_n_inner_iters(batch_idx, self.interval):
            _, log_str = runner.log_processor.get_log_after_iter(
                runner, batch_idx, 'val')
            runner.logger.info(log_str)

    def after_test_iter(self,
                        runner,
                        batch_idx: int,
                        data_batch: DATA_BATCH = None,
                        outputs: Optional[Sequence] = None) -> None:
        """Record logs after testing iteration.

        Args:
            runner (Runner): The runner of the testing process.
            batch_idx (int): The index of the current batch in the test loop.
            data_batch (dict or tuple or list, optional): Data from dataloader.
            outputs (sequence, optional): Outputs from model.
        """
        if self.every_n_inner_iters(batch_idx, self.interval):
            _, log_str = runner.log_processor.get_log_after_iter(
                runner, batch_idx, 'test')
            runner.logger.info(log_str)

    def after_val_epoch(self,
                        runner,
                        metrics: Optional[Dict[str, float]] = None) -> None:
        """All subclasses should override this method, if they need any
        operations after each validation epoch.

        Args:
            runner (Runner): The runner of the validation process.
            metrics (Dict[str, float], optional): Evaluation results of all
                metrics on validation dataset. The keys are the names of the
                metrics, and the values are corresponding results.
        """
        tag, log_str = runner.log_processor.get_log_after_epoch(
            runner, len(runner.val_dataloader), 'val')
        runner.logger.info(log_str)
        if self.log_metric_by_epoch:
            # When running a standalone validation (no train loop has been
            # built), `runner._train_loop` is still a config dict or None, and
            # accessing `runner.epoch` would trigger building the train loop.
            # Use 0 as the step in that case.
            if (isinstance(runner._train_loop, dict)
                    or runner._train_loop is None):
                epoch = 0
            else:
                epoch = runner.epoch
            runner.visualizer.add_scalars(
                tag, step=epoch, file_path=self.json_log_path)
        else:
            # Same guard as above, but step by iteration instead of epoch.
            if (isinstance(runner._train_loop, dict)
                    or runner._train_loop is None):
                cur_iter = 0
            else:
                cur_iter = runner.iter
            runner.visualizer.add_scalars(
                tag, step=cur_iter, file_path=self.json_log_path)

    def after_test_epoch(self,
                         runner,
                         metrics: Optional[Dict[str, float]] = None) -> None:
        """All subclasses should override this method, if they need any
        operations after each test epoch.

        Args:
            runner (Runner): The runner of the testing process.
            metrics (Dict[str, float], optional): Evaluation results of all
                metrics on test dataset. The keys are the names of the
                metrics, and the values are corresponding results.
        """
        tag, log_str = runner.log_processor.get_log_after_epoch(
            runner, len(runner.test_dataloader), 'test', with_non_scalar=True)
        runner.logger.info(log_str)
        # Persist the (json-friendly) test results next to the other logs.
        dump(
            self._process_tags(tag),
            osp.join(runner.log_dir, self.json_log_path))

    @staticmethod
    def _process_tags(tags: dict):
        """Convert tag values to json-friendly type."""

        def process_val(value):
            if isinstance(value, (list, tuple)):
                # Recursively process sequence elements.
                return [process_val(item) for item in value]
            elif isinstance(value, dict):
                # Recursively process mapping values.
                return {k: process_val(v) for k, v in value.items()}
            elif isinstance(value, (str, int, float, bool)) or value is None:
                # Already json-serializable.
                return value
            elif isinstance(value, (torch.Tensor, np.ndarray)):
                return value.tolist()
            # Unsupported types fall through and are mapped to None.

        processed_tags = OrderedDict(process_val(tags))

        return processed_tags

    def after_run(self, runner) -> None:
        """Copy logs to ``self.out_dir`` if ``self.out_dir is not None``

        Args:
            runner (Runner): The runner of the training/testing/validation
                process.
        """
        # Close the visualizer backends so pending events are flushed.
        runner.visualizer.close()

        # Nothing to export when no output directory was configured.
        if self.out_dir is None:
            return

        removed_files = []
        for filename in scandir(runner._log_dir, self.out_suffix, True):
            local_filepath = osp.join(runner._log_dir, filename)
            removed_files.append(local_filepath)
            out_filepath = self.file_backend.join_path(self.out_dir, filename)
            with open(local_filepath) as f:
                self.file_backend.put_text(f.read(), out_filepath)

            # Log inside the loop so every uploaded file is reported, not
            # just the last one (and nothing dangles if no file matched).
            runner.logger.info(
                f'The file {local_filepath} has been uploaded to '
                f'{out_filepath}.')

            if not self.keep_local:
                runner.logger.info(f'{local_filepath} was removed due to the '
                                   '`self.keep_local=False`. You can check '
                                   f'the running logs in {out_filepath}')

        if not self.keep_local:
            # Close file handlers first to avoid PermissionError on Windows
            # when removing files the logger still holds open.
            for handler in runner.logger.handlers:
                if isinstance(handler, logging.FileHandler):
                    handler.close()

            for file in removed_files:
                os.remove(file)
|