import os
import logging
import re
import time
from contextlib import contextmanager
from pathlib import Path

import torch
import torch.optim as optim
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from prefetch_generator import BackgroundGenerator


def clean_str(s):
    # Replace special characters with underscores so the string is safe
    # to use in file and directory names.
    return re.sub(pattern="[|@#!¡·$€%&()=?¿^*;:,¨´><+]", repl="_", string=s)


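# Illustrative usage of clean_str (the example string is made up):
#   clean_str('exp@lr=0.01:run#2')  # -> 'exp_lr_0.01_run_2'

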
def create_logger(cfg, cfg_path, phase='train', rank=-1):
    """Set up file + console logging and create the output/tensorboard
    directories. Only the master process (rank -1 or 0) writes files;
    all other ranks get (None, None, None)."""
    dataset = cfg.DATASET.DATASET
    dataset = dataset.replace(':', '_')
    model = cfg.MODEL.NAME
    cfg_path = os.path.basename(cfg_path).split('.')[0]

    if rank in [-1, 0]:
        time_str = time.strftime('%Y-%m-%d-%H-%M')
        log_file = '{}_{}_{}.log'.format(cfg_path, time_str, phase)

        tensorboard_log_dir = Path(cfg.LOG_DIR) / dataset / model / \
            (cfg_path + '_' + time_str)
        final_output_dir = tensorboard_log_dir
        if not tensorboard_log_dir.exists():
            print('=> creating {}'.format(tensorboard_log_dir))
            tensorboard_log_dir.mkdir(parents=True)

        final_log_file = tensorboard_log_dir / log_file
        head = '%(asctime)-15s %(message)s'
        logging.basicConfig(filename=str(final_log_file),
                            format=head)
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        console = logging.StreamHandler()
        logging.getLogger('').addHandler(console)

        return logger, str(final_output_dir), str(tensorboard_log_dir)
    else:
        return None, None, None


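# Sketch of a typical call (assumes a YACS-style cfg with the
# DATASET.DATASET, MODEL.NAME and LOG_DIR fields referenced above;
# the config path is a placeholder):
#   logger, output_dir, tb_log_dir = create_logger(
#       cfg, 'configs/example.yaml', phase='train', rank=rank)
#   if logger:
#       logger.info('training starts')

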
def select_device(logger, device='', batch_size=None):
    # device = '' (auto), 'cpu', or a CUDA device string such as '0' or '0,1,2,3'
    cpu_request = device.lower() == 'cpu'
    if device and not cpu_request:  # a specific GPU was requested
        os.environ['CUDA_VISIBLE_DEVICES'] = device
        assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device

    cuda = False if cpu_request else torch.cuda.is_available()
    if cuda:
        c = 1024 ** 2  # bytes to MB
        ng = torch.cuda.device_count()
        if ng > 1 and batch_size:  # the batch must split evenly across GPUs
            assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng)
        x = [torch.cuda.get_device_properties(i) for i in range(ng)]
        s = f'Using torch {torch.__version__} '
        for i in range(ng):
            if i == 1:
                s = ' ' * len(s)  # align subsequent lines under the first
            if logger:
                logger.info("%sCUDA:%g (%s, %dMB)" % (s, i, x[i].name, x[i].total_memory / c))
    elif logger:
        logger.info(f'Using torch {torch.__version__} CPU')

    if logger:
        logger.info('')  # skip a line
    return torch.device('cuda:0' if cuda else 'cpu')


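# Illustrative calls (device strings follow CUDA_VISIBLE_DEVICES syntax):
#   device = select_device(logger, device='0,1', batch_size=32)  # two GPUs
#   device = select_device(logger, device='cpu')                 # force CPU
#   model = model.to(device)

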
def get_optimizer(cfg, model):
    """Build the optimizer named in cfg.TRAIN.OPTIMIZER over the model's
    trainable parameters; returns None for an unknown name."""
    optimizer = None
    if cfg.TRAIN.OPTIMIZER == 'sgd':
        optimizer = optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=cfg.TRAIN.LR0,
            momentum=cfg.TRAIN.MOMENTUM,
            weight_decay=cfg.TRAIN.WD,
            nesterov=cfg.TRAIN.NESTEROV
        )
    elif cfg.TRAIN.OPTIMIZER == 'adam':
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=cfg.TRAIN.LR0,
            betas=(cfg.TRAIN.MOMENTUM, 0.999)  # reuse momentum as Adam beta1
        )

    return optimizer


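# Minimal sketch of the training step this feeds into (cfg field values
# such as LR0=0.01 or MOMENTUM=0.937 are illustrative, not prescribed here):
#   optimizer = get_optimizer(cfg, model)
#   optimizer.zero_grad()
#   loss.backward()
#   optimizer.step()

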
def save_checkpoint(epoch, name, model, optimizer, output_dir, filename, is_best=False):
    # Unwrap DataParallel/DistributedDataParallel so the checkpoint also
    # loads into a bare, unwrapped model.
    model_state = model.module.state_dict() if is_parallel(model) else model.state_dict()
    checkpoint = {
        'epoch': epoch,
        'model': name,
        'state_dict': model_state,
        'optimizer': optimizer.state_dict(),
    }
    torch.save(checkpoint, os.path.join(output_dir, filename))
    if is_best and 'state_dict' in checkpoint:
        torch.save(checkpoint['state_dict'],
                   os.path.join(output_dir, 'model_best.pth'))


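# Typical call at the end of a validation epoch (variable names such as
# perf/best_perf are placeholders):
#   save_checkpoint(epoch, cfg.MODEL.NAME, model, optimizer,
#                   output_dir, 'checkpoint.pth', is_best=perf > best_perf)

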
def initialize_weights(model):
    for m in model.modules():
        t = type(m)
        if t is nn.Conv2d:
            pass  # keep PyTorch's default init for conv layers
        elif t is nn.BatchNorm2d:
            m.eps = 1e-3
            m.momentum = 0.03
        elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
            m.inplace = True  # save memory by computing activations in place


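# Quick check of the effect on a toy model (for illustration only):
#   net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
#   initialize_weights(net)
#   assert net[1].momentum == 0.03 and net[2].inplace

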
def xyxy2xywh(x):
    # Convert nx4 boxes from [x1, y1, x2, y2] (corners) to
    # [x, y, w, h] (center, width, height).
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = (x[:, 0] + x[:, 2]) / 2  # x center
    y[:, 1] = (x[:, 1] + x[:, 3]) / 2  # y center
    y[:, 2] = x[:, 2] - x[:, 0]  # width
    y[:, 3] = x[:, 3] - x[:, 1]  # height
    return y


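# Worked example: the corner box [0, 0, 10, 4] has its center at (5, 2)
# and is 10 wide by 4 tall:
#   xyxy2xywh(np.array([[0., 0., 10., 4.]]))  # -> [[5., 2., 10., 4.]]

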
def is_parallel(model):
    # True if the model is wrapped in (Distributed)DataParallel.
    return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)


def time_synchronized():
    # Wait for pending CUDA kernels so the wall-clock time is accurate.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()


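# Typical GPU-accurate timing pattern:
#   t0 = time_synchronized()
#   output = model(img)
#   t1 = time_synchronized()  # forward-pass time = t1 - t0 seconds

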
class DataLoaderX(DataLoader):
    """DataLoader that prefetches batches in a background thread."""
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())


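# Drop-in replacement for DataLoader; it takes the same constructor
# arguments (the dataset and batch size below are placeholders):
#   loader = DataLoaderX(dataset, batch_size=32, num_workers=4,
#                        shuffle=True, pin_memory=True)

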
@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """
    Context manager that makes all processes in distributed training wait
    for the local master to finish the wrapped block first.
    """
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()
    yield
    if local_rank == 0:
        torch.distributed.barrier()
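

# Typical use: let rank 0 download or cache a dataset while the other ranks
# wait (build_dataset is a placeholder for any cache-producing call):
#   with torch_distributed_zero_first(local_rank):
#       dataset = build_dataset(cfg)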