| import math |
| import random |
| from collections import namedtuple, OrderedDict |
| from dataclasses import dataclass |
| from typing import Dict, Optional
|
|
| import torch |
| from torch import nn |
| import torch.nn.functional as F |
| from torchvision import ops |
| from torchvision.ops.feature_pyramid_network import FeaturePyramidNetwork |
| from transformers import PreTrainedModel
| from transformers.utils import ModelOutput
| from transformers.utils.backbone_utils import load_backbone
|
|
| from .configuration_diffusiondet import DiffusionDetConfig
|
|
| from .head import HeadDynamicK |
| from .loss import CriterionDynamicK |
|
|
|
|
| ModelPrediction = namedtuple('ModelPrediction', ['pred_noise', 'pred_x_start']) |
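| # pred_noise: the noise implied by the head's box prediction at timestep t;
| # pred_x_start: the predicted clean (t = 0) box signal in the scaled diffusion space.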
|
|
|
|
| def default(val, d): |
| if val is not None: |
| return val |
| return d() if callable(d) else d |
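| # e.g. default(None, 10) -> 10; default(None, lambda: 10) -> 10; default(5, 10) -> 5.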
|
|
|
|
| def extract(a, t, x_shape): |
| """extract the appropriate t index for a batch of indices""" |
| batch_size = t.shape[0] |
| out = a.gather(-1, t) |
| return out.reshape(batch_size, *((1,) * (len(x_shape) - 1))) |
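| # e.g. with a of shape (T,), t of shape (B,), and x_shape (B, N, 4), extract
| # returns values shaped (B, 1, 1) that broadcast over a batch of boxes.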
|
|
|
|
| def cosine_beta_schedule(timesteps, s=0.008): |
| """ |
| cosine schedule |
| as proposed in https://openreview.net/forum?id=-NEXDKk8gZ |
| """ |
| steps = timesteps + 1 |
| x = torch.linspace(0, timesteps, steps) |
| alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2 |
| alphas_cumprod = alphas_cumprod / alphas_cumprod[0] |
| betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1]) |
| return torch.clip(betas, 0, 0.999) |
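| # Illustrative check: for cosine_beta_schedule(1000), betas[0] is on the
| # order of 1e-5 and the final entries hit the 0.999 clip.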
|
|
| @dataclass |
| class DiffusionDetOutput(ModelOutput): |
| """ |
| Output type of DiffusionDet. |
| """ |
|
|
| loss: Optional[torch.FloatTensor] = None |
| loss_dict: Optional[Dict] = None |
| logits: torch.FloatTensor = None |
| labels: torch.IntTensor = None |
| pred_boxes: torch.FloatTensor = None |
|
|
| class DiffusionDet(PreTrainedModel): |
| """ |
| Implement DiffusionDet |
| """ |
| config_class = DiffusionDetConfig |
| main_input_name = "pixel_values" |
|
|
| def __init__(self, config): |
| super().__init__(config)
|
|
| self.in_features = config.roi_head_in_features |
| self.num_classes = config.num_labels |
| self.num_proposals = config.num_proposals |
| self.num_heads = config.num_heads |
|
|
| self.backbone = load_backbone(config) |
| self.fpn = FeaturePyramidNetwork(
| in_channels_list=self.backbone.channels,
| out_channels=config.fpn_out_channels,
| )
|
|
| # Diffusion schedule: DiffusionDet trains with 1000 timesteps.
| betas = cosine_beta_schedule(1000)
| alphas_cumprod = torch.cumprod(1 - betas, dim=0)
|
|
| timesteps, = betas.shape |
| sampling_timesteps = config.sample_step |
|
|
| self.register_buffer('alphas_cumprod', alphas_cumprod) |
| self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1. - alphas_cumprod)) |
| self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod)) |
| self.register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod)) |
| self.register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1. / alphas_cumprod - 1)) |
|
|
| self.num_timesteps = int(timesteps)
| self.sampling_timesteps = default(sampling_timesteps, timesteps)
| self.ddim_sampling_eta = 1.  # eta = 1 keeps the stochastic (DDPM-like) update; eta = 0 would be deterministic DDIM
| self.scale = config.snr_scale  # signal scaling of the box coordinates
| assert self.sampling_timesteps <= timesteps
|
|
| roi_input_shape = { |
| 'p2': {'stride': 4}, |
| 'p3': {'stride': 8}, |
| 'p4': {'stride': 16}, |
| 'p5': {'stride': 32}, |
| 'p6': {'stride': 64} |
| } |
| self.head = HeadDynamicK(config, roi_input_shape=roi_input_shape) |
|
|
| self.deep_supervision = config.deep_supervision |
| self.use_focal = config.use_focal |
| self.use_fed_loss = config.use_fed_loss |
| self.use_nms = config.use_nms |
|
|
| weight_dict = { |
| "loss_ce": config.class_weight, "loss_bbox": config.l1_weight, "loss_giou": config.giou_weight |
| } |
| if self.deep_supervision: |
| aux_weight_dict = {} |
| for i in range(self.num_heads - 1): |
| aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) |
| weight_dict.update(aux_weight_dict) |
|
|
| self.criterion = CriterionDynamicK(config, num_classes=self.num_classes, weight_dict=weight_dict) |
|
|
| def _init_weights(self, module): |
| if isinstance(module, (nn.Linear, nn.Conv2d)):
| torch.nn.init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu') |
| if module.bias is not None: |
| torch.nn.init.constant_(module.bias, 0) |
| elif isinstance(module, nn.BatchNorm2d): |
| torch.nn.init.constant_(module.weight, 1) |
| torch.nn.init.constant_(module.bias, 0) |
|
|
| def predict_noise_from_start(self, x_t, t, x0):
| # Invert q_sample: eps = (x_t / sqrt(a_t) - x_0) / sqrt(1 / a_t - 1).
| return (
| (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) /
| extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
| )
|
|
| def model_predictions(self, backbone_feats, images_whwh, x, t): |
| x_boxes = torch.clamp(x, min=-1 * self.scale, max=self.scale)
| x_boxes = ((x_boxes / self.scale) + 1) / 2  # map [-scale, scale] back to [0, 1]
| x_boxes = ops.box_convert(x_boxes, 'cxcywh', 'xyxy')
| x_boxes = x_boxes * images_whwh[:, None, :]  # scale to absolute pixel coordinates
| outputs_class, outputs_coord = self.head(backbone_feats, x_boxes, t) |
|
|
| x_start = outputs_coord[-1] |
| x_start = x_start / images_whwh[:, None, :] |
| x_start = ops.box_convert(x_start, 'xyxy', 'cxcywh') |
| x_start = (x_start * 2 - 1.) * self.scale |
| x_start = torch.clamp(x_start, min=-1 * self.scale, max=self.scale) |
| pred_noise = self.predict_noise_from_start(x, t, x_start) |
|
|
| return ModelPrediction(pred_noise, x_start), outputs_class, outputs_coord |
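| # ddim_sample below runs the reverse process on random boxes: each step
| # predicts x_0 with the detection head, derives the implied noise, and applies
| # the DDIM update x_next = sqrt(a_next) * x_0 + sqrt(1 - a_next - sigma^2) * eps + sigma * z,
| # with a = alphas_cumprod and z fresh Gaussian noise.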
|
|
| @torch.no_grad() |
| def ddim_sample(self, batched_inputs, backbone_feats, images_whwh): |
| bs = len(batched_inputs)
| shape = (bs, self.num_proposals, 4)
|
|
| # Evenly spaced timesteps [-1, 0, ..., T - 1], walked in reverse as (t, t_next) pairs.
| times = torch.linspace(-1, self.num_timesteps - 1, steps=self.sampling_timesteps + 1)
| times = list(reversed(times.int().tolist()))
| time_pairs = list(zip(times[:-1], times[1:]))
|
|
| img = torch.randn(shape, device=self.device) |
|
|
| ensemble_score, ensemble_label, ensemble_coord = [], [], [] |
| outputs_class, outputs_coord = None, None |
| for time, time_next in time_pairs: |
| time_cond = torch.full((bs,), time, device=self.device, dtype=torch.long) |
|
|
| preds, outputs_class, outputs_coord = self.model_predictions(backbone_feats, images_whwh, img, time_cond) |
| pred_noise, x_start = preds.pred_noise, preds.pred_x_start |
|
|
| # Box renewal: keep only boxes whose best class score clears the threshold.
| score_per_image = outputs_class[-1][0]
| threshold = 0.5
| score_per_image = torch.sigmoid(score_per_image)
| value, _ = torch.max(score_per_image, -1, keepdim=False)
| keep_idx = value > threshold
| num_remain = torch.sum(keep_idx)
|
|
| pred_noise = pred_noise[:, keep_idx, :] |
| x_start = x_start[:, keep_idx, :] |
| img = img[:, keep_idx, :] |
|
|
| if time_next < 0: |
| img = x_start |
| continue |
|
|
| alpha = self.alphas_cumprod[time] |
| alpha_next = self.alphas_cumprod[time_next] |
|
|
| sigma = self.ddim_sampling_eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt() |
| c = (1 - alpha_next - sigma ** 2).sqrt() |
|
|
| noise = torch.randn_like(img) |
|
|
| img = x_start * alpha_next.sqrt() + \ |
| c * pred_noise + \ |
| sigma * noise |
|
|
| # Replenish the discarded proposals with fresh Gaussian boxes.
| img = torch.cat((img, torch.randn(1, self.num_proposals - num_remain, 4, device=img.device)), dim=1)
|
|
| if self.sampling_timesteps > 1: |
| box_pred_per_image, scores_per_image, labels_per_image = self.inference(outputs_class[-1], |
| outputs_coord[-1]) |
| ensemble_score.append(scores_per_image) |
| ensemble_label.append(labels_per_image) |
| ensemble_coord.append(box_pred_per_image) |
|
|
| if self.sampling_timesteps > 1: |
| box_pred_per_image = torch.cat(ensemble_coord, dim=0) |
| scores_per_image = torch.cat(ensemble_score, dim=0) |
| labels_per_image = torch.cat(ensemble_label, dim=0) |
|
|
| if self.use_nms: |
| keep = ops.batched_nms(box_pred_per_image, scores_per_image, labels_per_image, 0.5) |
| box_pred_per_image = box_pred_per_image[keep] |
| scores_per_image = scores_per_image[keep] |
| labels_per_image = labels_per_image[keep] |
|
|
| return box_pred_per_image, scores_per_image, labels_per_image |
| else: |
| return self.inference(outputs_class[-1], outputs_coord[-1]) |
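| # q_sample below implements the closed-form forward process
| # q(x_t | x_0) = N(sqrt(a_t) * x_0, (1 - a_t) * I), with a = alphas_cumprod,
| # sampled via x_t = sqrt(a_t) * x_0 + sqrt(1 - a_t) * eps.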
|
|
| def q_sample(self, x_start, t, noise=None): |
| if noise is None: |
| noise = torch.randn_like(x_start) |
|
|
| sqrt_alphas_cumprod_t = extract(self.sqrt_alphas_cumprod, t, x_start.shape) |
| sqrt_one_minus_alphas_cumprod_t = extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) |
|
|
| return sqrt_alphas_cumprod_t * x_start + sqrt_one_minus_alphas_cumprod_t * noise |
|
|
| def forward(self, pixel_values, labels=None): |
| """ |
| Args: |
| """ |
| images = pixel_values.to(self.device) |
| images_whwh = list() |
| for image in images: |
| h, w = image.shape[-2:] |
| images_whwh.append(torch.tensor([w, h, w, h], device=self.device)) |
| images_whwh = torch.stack(images_whwh) |
|
|
| features = self.backbone(images) |
| features = OrderedDict( |
| [(key, feature) for key, feature in zip(self.backbone.out_features, features.feature_maps)] |
| ) |
| features = self.fpn(features) |
| features = [features[f] for f in features.keys()] |
|
|
| if not self.training: |
| pred_logits, pred_labels, pred_boxes = self.ddim_sample(pixel_values, features, images_whwh) |
| return DiffusionDetOutput(
| logits=pred_logits,
| labels=pred_labels,
| pred_boxes=pred_boxes,
| )
|
|
| # Training: diffuse ground-truth boxes to random timesteps, then denoise them.
| labels = [label.to(self.device) for label in labels]
| targets, x_boxes, noises, ts = self.prepare_targets(labels)
|
|
| ts = ts.squeeze(-1)
| x_boxes = x_boxes * images_whwh[:, None, :]  # diffused boxes are normalized; scale to pixels
|
|
| outputs_class, outputs_coord = self.head(features, x_boxes, ts) |
| output = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} |
|
|
| if self.deep_supervision: |
| output['aux_outputs'] = [{'pred_logits': a, 'pred_boxes': b} |
| for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] |
|
|
| loss_dict = self.criterion(output, targets) |
| weight_dict = self.criterion.weight_dict |
| for k in loss_dict.keys():
| if k in weight_dict:
| loss_dict[k] *= weight_dict[k]
| loss_dict['loss'] = sum(loss_dict[k] for k in weight_dict.keys() if k in loss_dict)
|
|
| return DiffusionDetOutput( |
| loss=loss_dict['loss'], |
| loss_dict=loss_dict, |
| logits=output['pred_logits'], |
| pred_boxes=output['pred_boxes'] |
| ) |
|
|
| def prepare_diffusion_concat(self, gt_boxes): |
| """ |
| :param gt_boxes: (cx, cy, w, h), normalized |
| :param num_proposals: |
| """ |
| t = torch.randint(0, self.num_timesteps, (1,), device=self.device).long() |
| noise = torch.randn(self.num_proposals, 4, device=self.device) |
|
|
| num_gt = gt_boxes.shape[0] |
| if not num_gt: |
| gt_boxes = torch.as_tensor([[0.5, 0.5, 1., 1.]], dtype=torch.float, device=self.device) |
| num_gt = 1 |
|
|
| if num_gt < self.num_proposals:
| # Pad with placeholder boxes ~ N(0.5, (1/6)^2) so most stay inside the image.
| box_placeholder = torch.randn(self.num_proposals - num_gt, 4, device=self.device) / 6. + 0.5
| box_placeholder[:, 2:] = torch.clip(box_placeholder[:, 2:], min=1e-4)  # keep widths/heights positive
| x_start = torch.cat((gt_boxes, box_placeholder), dim=0)
| elif num_gt > self.num_proposals: |
| select_mask = [True] * self.num_proposals + [False] * (num_gt - self.num_proposals) |
| random.shuffle(select_mask) |
| x_start = gt_boxes[select_mask] |
| else: |
| x_start = gt_boxes |
|
|
| x_start = (x_start * 2. - 1.) * self.scale  # map [0, 1] coordinates to [-scale, scale]
|
|
| # Forward diffusion: noise x_start to timestep t.
| x = self.q_sample(x_start=x_start, t=t, noise=noise)
|
|
| x = torch.clamp(x, min=-1 * self.scale, max=self.scale) |
| x = ((x / self.scale) + 1) / 2. |
|
|
| diff_boxes = ops.box_convert(x, 'cxcywh', 'xyxy') |
|
|
| return diff_boxes, noise, t |
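| # e.g. with num_proposals = 300: 5 ground-truth boxes get padded with 295
| # placeholders sampled around the image center, while 400 ground-truth boxes
| # are randomly subsampled down to 300.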
|
|
| def prepare_targets(self, targets): |
| new_targets = [] |
| diffused_boxes = [] |
| noises = [] |
| ts = [] |
| for target in targets: |
| h, w = target.size |
| image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device) |
| gt_classes = target.class_labels.to(self.device) |
| gt_boxes = target.boxes.to(self.device) |
| d_boxes, d_noise, d_t = self.prepare_diffusion_concat(gt_boxes) |
| image_size_xyxy_tgt = image_size_xyxy.unsqueeze(0).repeat(len(gt_boxes), 1) |
| gt_boxes = gt_boxes * image_size_xyxy |
| gt_boxes = ops.box_convert(gt_boxes, 'cxcywh', 'xyxy') |
|
|
| diffused_boxes.append(d_boxes) |
| noises.append(d_noise) |
| ts.append(d_t) |
| new_targets.append({ |
| "labels": gt_classes, |
| "boxes": target.boxes.to(self.device), |
| "boxes_xyxy": gt_boxes, |
| "image_size_xyxy": image_size_xyxy.to(self.device), |
| "image_size_xyxy_tgt": image_size_xyxy_tgt.to(self.device), |
| "area": ops.box_area(target.boxes.to(self.device)), |
| }) |
|
|
| return new_targets, torch.stack(diffused_boxes), torch.stack(noises), torch.stack(ts) |
|
|
| def inference(self, box_cls, box_pred): |
| """ |
| Arguments: |
| box_cls (Tensor): tensor of shape (batch_size, num_proposals, K). |
| The tensor predicts the classification probability for each proposal. |
| box_pred (Tensor): tensors of shape (batch_size, num_proposals, 4). |
| The tensor predicts 4-vector (x,y,w,h) box |
| regression values for every proposal |
| image_sizes (List[torch.Size]): the input image sizes |
| |
| Returns: |
| results (List[Instances]): a list of #images elements. |
| """ |
| boxes_output = []
| logits_output = []
| labels_output = []
|
|
| if self.use_focal or self.use_fed_loss: |
| scores = torch.sigmoid(box_cls) |
| labels = (torch.arange(self.num_classes, device=self.device)
| .unsqueeze(0).repeat(self.num_proposals, 1).flatten(0, 1))
|
|
| for scores_per_image, box_pred_per_image in zip(scores, box_pred):
| # Rank all (proposal, class) pairs and keep the top num_proposals of them.
| scores_per_image, topk_indices = scores_per_image.flatten(0, 1).topk(self.num_proposals, sorted=False)
| labels_per_image = labels[topk_indices] |
| box_pred_per_image = box_pred_per_image.view(-1, 1, 4).repeat(1, self.num_classes, 1).view(-1, 4) |
| box_pred_per_image = box_pred_per_image[topk_indices] |
|
|
| if self.sampling_timesteps > 1:
| # Step-ensembling assumes batch size 1; return this image's predictions.
| return box_pred_per_image, scores_per_image, labels_per_image
|
|
| if self.use_nms: |
| keep = ops.batched_nms(box_pred_per_image, scores_per_image, labels_per_image, 0.5) |
| box_pred_per_image = box_pred_per_image[keep] |
| scores_per_image = scores_per_image[keep] |
| labels_per_image = labels_per_image[keep] |
|
|
| boxes_output.append(box_pred_per_image) |
| logits_output.append(scores_per_image) |
| labels_output.append(labels_per_image) |
| else: |
| # Softmax over classes; the last ("no object") class is dropped before the max.
| scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1)
|
|
| for scores_per_image, labels_per_image, box_pred_per_image in zip(scores, labels, box_pred):
| if self.sampling_timesteps > 1: |
| return box_pred_per_image, scores_per_image, labels_per_image |
|
|
| if self.use_nms: |
| keep = ops.batched_nms(box_pred_per_image, scores_per_image, labels_per_image, 0.5) |
| box_pred_per_image = box_pred_per_image[keep] |
| scores_per_image = scores_per_image[keep] |
| labels_per_image = labels_per_image[keep] |
|
|
| boxes_output.append(box_pred_per_image) |
| logits_output.append(scores_per_image) |
| labels_output.append(labels_per_image) |
|
|
| return boxes_output, logits_output, labels_output |
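| # Minimal usage sketch (illustrative; assumes a populated DiffusionDetConfig):
| #   config = DiffusionDetConfig()
| #   model = DiffusionDet(config).eval()
| #   with torch.no_grad():
| #       outputs = model(pixel_values)  # DiffusionDetOutput with logits, labels, pred_boxes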
|
|