# rtmdet-tiny / modeling_rtmdet.py
# (author: akore)
# feat: add original_size param to forward() — boxes auto-scaled to image space
# commit 54e05fc (verified)
from typing import List, Optional, Tuple, Union, Sequence, Dict
from dataclasses import dataclass
import inspect
from functools import partial
import warnings
import math
import torch
import torchvision
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch.nn.modules.batchnorm import _BatchNorm, SyncBatchNorm
from transformers.modeling_outputs import ModelOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_rtmdet import RTMDetConfig
logger = logging.get_logger(__name__)
@dataclass
class DetectionOutput(ModelOutput):
    """
    Output type for object detection models.
    Args:
        boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
            Detection boxes in format [x1, y1, x2, y2]. Coordinates are in
            model-input space (640×640) by default, or in original image pixel
            space when ``original_size`` was passed to ``forward()``.
        scores (`torch.FloatTensor` of shape `(batch_size, num_boxes)`):
            Detection confidence scores.
        labels (`torch.LongTensor` of shape `(batch_size, num_boxes)`):
            Detection class indices.
        loss (`torch.FloatTensor`, *optional*):
            Loss value if training.
    """
    # Defaults are None so instances can be built field-by-field; ModelOutput
    # drops None-valued fields from its tuple/dict views.
    boxes: torch.FloatTensor = None
    scores: torch.FloatTensor = None
    labels: torch.LongTensor = None
    loss: Optional[torch.FloatTensor] = None
# Replace MODELS registry with direct class mappings
# Maps config 'type' strings (mmcv-style names) to torch.nn activation classes.
ACTIVATION_LAYERS = {
    'ReLU': nn.ReLU,
    'LeakyReLU': nn.LeakyReLU,
    'PReLU': nn.PReLU,
    'SiLU': nn.SiLU,
    'Sigmoid': nn.Sigmoid,
    'Tanh': nn.Tanh,
    'GELU': nn.GELU,
    'Swish': nn.SiLU,  # Swish is equivalent to SiLU
    'Hardsigmoid': nn.Hardsigmoid,
    'HSigmoid': nn.Hardsigmoid
}
# Simple Config Type replacement (stands in for mmengine's ConfigDict aliases)
ConfigType = Dict
OptConfigType = Optional[Dict]
OptMultiConfig = Optional[Union[Dict, List[Dict]]]
def build_activation_layer(cfg: Dict) -> nn.Module:
    """Instantiate an activation layer from a config dict.

    Args:
        cfg (dict): Must contain the key ``'type'`` naming an entry of
            ``ACTIVATION_LAYERS``; every other item is forwarded to the
            layer constructor.

    Returns:
        nn.Module: The constructed activation layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``'type'`` is missing or not a known activation name.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    kwargs = dict(cfg)
    layer_name = kwargs.pop('type')
    if layer_name not in ACTIVATION_LAYERS:
        raise KeyError(f'Unrecognized activation type {layer_name}')
    return ACTIVATION_LAYERS[layer_name](**kwargs)
def kaiming_init(module,
                 a=0,
                 mode='fan_out',
                 nonlinearity='relu',
                 bias=0,
                 distribution='normal'):
    """Kaiming-initialize ``module.weight`` and constant-fill ``module.bias``.

    Args:
        module: Any module; weight/bias are skipped when absent or None.
        a: Negative slope passed to the Kaiming initializer.
        mode: ``'fan_in'`` or ``'fan_out'``.
        nonlinearity: Nonlinearity name for gain computation.
        bias: Constant value written into the bias, if present.
        distribution: ``'uniform'`` or ``'normal'`` Kaiming variant.
    """
    assert distribution in ['uniform', 'normal']
    weight = getattr(module, 'weight', None)
    if weight is not None:
        init_fn = (nn.init.kaiming_uniform_
                   if distribution == 'uniform' else nn.init.kaiming_normal_)
        init_fn(weight, a=a, mode=mode, nonlinearity=nonlinearity)
    bias_param = getattr(module, 'bias', None)
    if bias_param is not None:
        nn.init.constant_(bias_param, bias)
def constant_init(module, val, bias=0):
    """Fill ``module.weight`` with ``val`` and ``module.bias`` with ``bias``.

    Attributes that are absent or None are silently skipped, so this is safe
    to call on any module.
    """
    weight = getattr(module, 'weight', None)
    if weight is not None:
        nn.init.constant_(weight, val)
    bias_param = getattr(module, 'bias', None)
    if bias_param is not None:
        nn.init.constant_(bias_param, bias)
class _InstanceNorm(nn.modules.instancenorm._InstanceNorm):
    """Instance Normalization Base Class."""
    pass


def infer_abbr(class_type):
    """Infer the short name ('bn', 'gn', 'ln', 'in', ...) for a norm class.

    Resolution order: an explicit ``_abbr_`` attribute wins, then subclass
    checks (instance norm tested before batch norm), then a substring match
    on the lowercased class name, finally the generic 'norm_layer'.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_
    # Subclass-based resolution; IN is checked before BN on purpose.
    for base, abbr in ((_InstanceNorm, 'in'), (_BatchNorm, 'bn'),
                       (nn.GroupNorm, 'gn'), (nn.LayerNorm, 'ln')):
        if issubclass(class_type, base):
            return abbr
    # Fall back to matching tokens in the class name.
    lowered = class_type.__name__.lower()
    for token, abbr in (('batch', 'bn'), ('group', 'gn'), ('layer', 'ln'),
                        ('instance', 'in')):
        if token in lowered:
            return abbr
    return 'norm_layer'
# Create mapping from strings to layer classes
# Normalization layers: config 'type' -> torch.nn class ('BN'/'IN' default to 2d).
NORM_LAYERS = {
    'BN': nn.BatchNorm2d,
    'BN1d': nn.BatchNorm1d,
    'BN2d': nn.BatchNorm2d,
    'BN3d': nn.BatchNorm3d,
    'SyncBN': SyncBatchNorm,
    'GN': nn.GroupNorm,
    'LN': nn.LayerNorm,
    'IN': nn.InstanceNorm2d,
    'IN1d': nn.InstanceNorm1d,
    'IN2d': nn.InstanceNorm2d,
    'IN3d': nn.InstanceNorm3d
}
# Convolution layers: 'Conv' defaults to 2d.
CONV_LAYERS = {
    'Conv1d': nn.Conv1d,
    'Conv2d': nn.Conv2d,
    'Conv3d': nn.Conv3d,
    'Conv': nn.Conv2d
}
# Explicit padding layers used for padding modes torch Conv lacks natively.
PADDING_LAYERS = {
    'zero': nn.ZeroPad2d,
    'reflect': nn.ReflectionPad2d,
    'replicate': nn.ReplicationPad2d
}
def build_norm_layer(cfg: Dict,
                     num_features: int,
                     postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:
    """Build a normalization layer from a config dict.

    Args:
        cfg (dict): Must contain ``'type'`` (a key of ``NORM_LAYERS``).
            Optional items: ``requires_grad`` (default True) toggles
            parameter gradients; remaining items go to the constructor
            (``eps`` defaults to 1e-5).
        num_features (int): Channel count of the normalized tensor.
        postfix (int | str): Appended to the inferred abbreviation to form
            the module name.

    Returns:
        tuple[str, nn.Module]: ``(name, layer)``, where ``name`` is e.g.
        ``'bn1'`` and suitable for ``add_module``.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    kwargs = dict(cfg)
    layer_type = kwargs.pop('type')
    if layer_type not in NORM_LAYERS:
        raise KeyError(f'Unrecognized norm type {layer_type}')
    norm_cls = NORM_LAYERS[layer_type]

    name = infer_abbr(norm_cls)
    assert isinstance(postfix, (int, str))
    name = name + str(postfix)

    requires_grad = kwargs.pop('requires_grad', True)
    kwargs.setdefault('eps', 1e-5)
    if norm_cls is nn.GroupNorm:
        # GroupNorm takes channels by keyword and requires num_groups.
        assert 'num_groups' in kwargs
        layer = norm_cls(num_channels=num_features, **kwargs)
    else:
        layer = norm_cls(num_features, **kwargs)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)
    for param in layer.parameters():
        param.requires_grad = requires_grad
    return name, layer
def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:
    """Build a convolution layer from a config dict.

    Args:
        cfg (dict | None): ``None`` falls back to a plain ``Conv2d``;
            otherwise must contain ``'type'`` (a key of ``CONV_LAYERS``),
            with remaining items forwarded to the constructor.
        *args, **kwargs: Positional/keyword arguments for the conv class
            (channels, kernel size, stride, ...).

    Returns:
        nn.Module: The constructed convolution layer.
    """
    if cfg is None:
        cfg_ = dict(type='Conv2d')
    elif not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    elif 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    else:
        cfg_ = dict(cfg)
    layer_type = cfg_.pop('type')
    if layer_type not in CONV_LAYERS:
        raise KeyError(f'Unrecognized conv type {layer_type}')
    return CONV_LAYERS[layer_type](*args, **kwargs, **cfg_)
def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    """Build an explicit padding layer (zero/reflect/replicate) from ``cfg``.

    Args:
        cfg (dict): Must contain ``'type'`` (a key of ``PADDING_LAYERS``);
            remaining items go to the padding-layer constructor.
        *args, **kwargs: Constructor arguments (typically the padding size).

    Returns:
        nn.Module: The constructed padding layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    remaining = dict(cfg)
    padding_type = remaining.pop('type')
    if padding_type not in PADDING_LAYERS:
        raise KeyError(f'Unrecognized padding type {padding_type}')
    return PADDING_LAYERS[padding_type](*args, **kwargs, **remaining)
def efficient_conv_bn_eval_forward(bn: _BatchNorm,
                                   conv: nn.modules.conv._ConvNd,
                                   x: torch.Tensor):
    """Fused Conv+BN forward for eval-mode BatchNorm.

    Implementation based on https://arxiv.org/abs/2305.11624
    "Tune-Mode ConvBN Blocks For Efficient Transfer Learning".
    Because normalization is affine, it can be folded into the convolution
    weights: normalize(weight conv feature) = (normalize weight) conv
    feature. Valid whenever the BN layer uses its running statistics (eval
    mode); reduces memory and computation.

    Args:
        bn (_BatchNorm): BatchNorm module supplying running stats/affine.
        conv (nn._ConvNd): Convolution module.
        x (torch.Tensor): Input feature map.
    """
    # Substitute identity values for missing bias/affine terms so the
    # algebra below covers every Conv/BN configuration.
    conv_bias = (conv.bias if conv.bias is not None
                 else torch.zeros_like(bn.running_var))
    gamma = (bn.weight if bn.weight is not None
             else torch.ones_like(bn.running_var))
    beta = (bn.bias if bn.bias is not None
            else torch.zeros_like(bn.running_var))

    # 1 / sqrt(var + eps), reshaped to broadcast over the conv weight:
    # shape [C_out, 1, 1, 1] for Conv2d.
    inv_std = torch.rsqrt(bn.running_var + bn.eps).reshape(
        [-1] + [1] * (len(conv.weight.shape) - 1))
    scale = gamma.view_as(inv_std) * inv_std

    # Fold the BN affine transform into weight and bias.
    fused_weight = conv.weight * scale
    fused_bias = beta + scale.flatten() * (conv_bias - bn.running_mean)
    return conv._conv_forward(x, fused_weight, fused_bias)
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.

    The three stages run in the order given by ``order`` (default
    ``('conv', 'norm', 'act')``). With ``bias='auto'`` the conv bias is
    dropped whenever a norm layer is configured. When
    ``efficient_conv_bn_eval`` is enabled and the norm is an eval-mode
    BatchNorm, conv and BN are fused on the fly inside ``forward`` (see
    ``efficient_conv_bn_eval_forward``).
    """
    _abbr_ = 'conv_block'

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 groups: int = 1,
                 bias: Union[bool, str] = 'auto',
                 conv_cfg: Optional[Dict] = None,
                 norm_cfg: Optional[Dict] = None,
                 act_cfg: Optional[Dict] = dict(type='ReLU'),
                 inplace: bool = True,
                 with_spectral_norm: bool = False,
                 padding_mode: str = 'zeros',
                 order: tuple = ('conv', 'norm', 'act'),
                 efficient_conv_bn_eval: bool = False):
        super().__init__()
        assert conv_cfg is None or isinstance(conv_cfg, dict)
        assert norm_cfg is None or isinstance(norm_cfg, dict)
        assert act_cfg is None or isinstance(act_cfg, dict)
        official_padding_mode = ['zeros', 'circular']
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.inplace = inplace
        self.with_spectral_norm = with_spectral_norm
        # Padding modes torch's Conv does not support natively ('reflect',
        # 'replicate', 'zero' via PADDING_LAYERS) get an explicit padding
        # layer in front of the conv instead.
        self.with_explicit_padding = padding_mode not in official_padding_mode
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == {'conv', 'norm', 'act'}
        self.with_norm = norm_cfg is not None
        self.with_activation = act_cfg is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == 'auto':
            bias = not self.with_norm
        self.with_bias = bias
        if self.with_explicit_padding:
            pad_cfg = dict(type=padding_mode)
            self.padding_layer = build_padding_layer(pad_cfg, padding)
        # reset padding to 0 for conv module
        conv_padding = 0 if self.with_explicit_padding else padding
        # build convolution layer
        self.conv = build_conv_layer(
            conv_cfg,
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=conv_padding,
            dilation=dilation,
            groups=groups,
            bias=bias)
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups
        if self.with_spectral_norm:
            self.conv = nn.utils.spectral_norm(self.conv)
        # build normalization layers
        if self.with_norm:
            # norm layer is after conv layer
            if order.index('norm') > order.index('conv'):
                norm_channels = out_channels
            else:
                norm_channels = in_channels
            self.norm_name, norm = build_norm_layer(
                norm_cfg, norm_channels)  # type: ignore
            self.add_module(self.norm_name, norm)
            if self.with_bias:
                if isinstance(norm, (_BatchNorm, _InstanceNorm)):
                    warnings.warn(
                        'Unnecessary conv bias before batch/instance norm')
        else:
            self.norm_name = None  # type: ignore
        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
        # build activation layer
        if self.with_activation:
            act_cfg_ = act_cfg.copy()  # type: ignore
            # nn.Tanh has no 'inplace' argument
            if act_cfg_['type'] not in [
                    'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU'
            ]:
                act_cfg_.setdefault('inplace', inplace)
            self.activate = build_activation_layer(act_cfg_)
        # Use msra init by default
        self.init_weights()

    @property
    def norm(self):
        # Resolve the norm submodule by its generated name ('bn', 'gn1', ...);
        # None when no norm layer was configured.
        if self.norm_name:
            return getattr(self, self.norm_name)
        else:
            return None

    def init_weights(self):
        # Only initialize here when the conv has no init of its own; a conv
        # class providing `init_weights` is assumed to handle itself.
        if not hasattr(self.conv, 'init_weights'):
            if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
                nonlinearity = 'leaky_relu'
                a = self.act_cfg.get('negative_slope', 0.01)
            else:
                nonlinearity = 'relu'
                a = 0
            kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
        if self.with_norm:
            constant_init(self.norm, 1, bias=0)

    def forward(self,
                x: torch.Tensor,
                activate: bool = True,
                norm: bool = True) -> torch.Tensor:
        """Apply the configured stages in ``self.order``.

        ``activate``/``norm`` let callers skip those stages for this call
        only.
        """
        layer_index = 0
        while layer_index < len(self.order):
            layer = self.order[layer_index]
            if layer == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                # if the next operation is norm and we have a norm layer in
                # eval mode and we have enabled `efficient_conv_bn_eval` for
                # the conv operator, then activate the optimized forward and
                # skip the next norm operator since it has been fused
                if layer_index + 1 < len(self.order) and \
                        self.order[layer_index + 1] == 'norm' and norm and \
                        self.with_norm and not self.norm.training and \
                        self.efficient_conv_bn_eval_forward is not None:
                    # Temporarily shadow the bound conv.forward with the
                    # fused implementation; the `del` below restores the
                    # class-level forward.
                    self.conv.forward = partial(
                        self.efficient_conv_bn_eval_forward, self.norm,
                        self.conv)
                    layer_index += 1
                    x = self.conv(x)
                    del self.conv.forward
                else:
                    x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and activate and self.with_activation:
                x = self.activate(x)
            layer_index += 1
        return x

    def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):
        # efficient_conv_bn_eval works for conv + bn
        # with `track_running_stats` option
        if efficient_conv_bn_eval and self.norm \
                and isinstance(self.norm, _BatchNorm) \
                and self.norm.track_running_stats:
            self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward  # noqa: E501
        else:
            self.efficient_conv_bn_eval_forward = None  # type: ignore

    @staticmethod
    def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,
                            bn: torch.nn.modules.batchnorm._BatchNorm,
                            efficient_conv_bn_eval=True) -> 'ConvModule':
        """Create a ConvModule from a conv and a bn module.

        Bypasses ``__init__`` via ``__new__`` and wires the given modules in
        directly (no activation layer, no re-initialization of weights).
        """
        self = ConvModule.__new__(ConvModule)
        super(ConvModule, self).__init__()
        self.conv_cfg = None
        self.norm_cfg = None
        self.act_cfg = None
        self.inplace = False
        self.with_spectral_norm = False
        self.with_explicit_padding = False
        self.order = ('conv', 'norm', 'act')
        self.with_norm = True
        self.with_activation = False
        self.with_bias = conv.bias is not None
        # build convolution layer
        self.conv = conv
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = self.conv.padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups
        # build normalization layers
        self.norm_name, norm = 'bn', bn
        self.add_module(self.norm_name, norm)
        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
        return self
class DepthwiseSeparableConvModule(nn.Module):
    """Depthwise separable convolution module.

    A depthwise conv (``groups=in_channels``) followed by a 1x1 pointwise
    conv, each wrapped in a ``ConvModule``. Per-stage norm/act configs
    (``dw_*``/``pw_*``) default to the shared ``norm_cfg``/``act_cfg``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 norm_cfg: Optional[Dict] = None,
                 act_cfg: Dict = dict(type='ReLU'),
                 dw_norm_cfg: Union[Dict, str] = 'default',
                 dw_act_cfg: Union[Dict, str] = 'default',
                 pw_norm_cfg: Union[Dict, str] = 'default',
                 pw_act_cfg: Union[Dict, str] = 'default',
                 **kwargs):
        super().__init__()
        assert 'groups' not in kwargs, 'groups should not be specified'

        # The sentinel string 'default' means: inherit the shared config.
        def _resolve(cfg, shared):
            return shared if cfg == 'default' else cfg

        self.depthwise_conv = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            norm_cfg=_resolve(dw_norm_cfg, norm_cfg),  # type: ignore
            act_cfg=_resolve(dw_act_cfg, act_cfg),  # type: ignore
            **kwargs)
        self.pointwise_conv = ConvModule(
            in_channels,
            out_channels,
            1,
            norm_cfg=_resolve(pw_norm_cfg, norm_cfg),  # type: ignore
            act_cfg=_resolve(pw_act_cfg, act_cfg),  # type: ignore
            **kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Depthwise then pointwise convolution."""
        return self.pointwise_conv(self.depthwise_conv(x))
class SPPBottleneck(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP.

    Halves the channels with a 1x1 conv, concatenates the result with
    max-pooled copies at several kernel sizes, then projects back to
    ``out_channels`` with another 1x1 conv.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
                 act_cfg=dict(type='Swish'),
                 init_cfg=None):
        super().__init__()
        hidden_channels = in_channels // 2
        self.conv1 = ConvModule(
            in_channels,
            hidden_channels,
            1,
            stride=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Same-resolution max pools: stride 1 with "same" padding.
        self.poolings = nn.ModuleList(
            nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2)
            for size in kernel_sizes)
        self.conv2 = ConvModule(
            hidden_channels * (len(kernel_sizes) + 1),
            out_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

    def forward(self, x):
        x = self.conv1(x)
        # Concatenate identity + pooled copies in fp32 (autocast disabled).
        with torch.amp.autocast(enabled=False, device_type=x.device.type):
            x = torch.cat([x] + [pool(x) for pool in self.poolings], dim=1)
        x = self.conv2(x)
        return x
class DarknetBottleneck(nn.Module):
    """The basic bottleneck block used in Darknet.

    A 1x1 reduction conv followed by a 3x3 conv, with an optional residual
    connection added when input and output channel counts match.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 expansion: float = 0.5,
                 add_identity: bool = True,
                 use_depthwise: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='Swish'),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        second_conv_cls = (DepthwiseSeparableConvModule
                           if use_depthwise else ConvModule)
        self.conv1 = ConvModule(
            in_channels,
            hidden_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv2 = second_conv_cls(
            hidden_channels,
            out_channels,
            3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Residual add is only valid when shapes match element-wise.
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        out = self.conv2(self.conv1(x))
        return out + x if self.add_identity else out
class CSPNeXtBlock(nn.Module):
    """The basic bottleneck block used in CSPNeXt.

    A 3x3 conv (depthwise-separable when ``use_depthwise``) followed by a
    large-kernel depthwise-separable conv, with an optional residual
    connection when input and output channels match.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 expansion: float = 0.5,
                 add_identity: bool = True,
                 use_depthwise: bool = False,
                 kernel_size: int = 5,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU'),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        first_conv_cls = (DepthwiseSeparableConvModule
                          if use_depthwise else ConvModule)
        self.conv1 = first_conv_cls(
            in_channels,
            hidden_channels,
            3,
            stride=1,
            padding=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # The second conv is always depthwise-separable with a large kernel.
        self.conv2 = DepthwiseSeparableConvModule(
            hidden_channels,
            out_channels,
            kernel_size,
            stride=1,
            padding=kernel_size // 2,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Residual add is only valid when shapes match element-wise.
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        out = self.conv2(self.conv1(x))
        return out + x if self.add_identity else out
class ChannelAttention(nn.Module):
    """Channel attention Module.

    Squeeze-style channel attention: global average pool -> 1x1 conv ->
    hard sigmoid, producing per-channel weights that rescale the input.
    """

    def __init__(self, channels: int,
                 init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None:
        super().__init__()
        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
        self.act = nn.Hardsigmoid(inplace=True)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function for ChannelAttention."""
        # Pool in fp32: autocast is disabled to keep the global average
        # accumulation in full precision.
        with torch.amp.autocast(enabled=False, device_type=x.device.type):
            attn = self.global_avgpool(x)
        attn = self.fc(attn)
        attn = self.act(attn)
        return x * attn
class CSPLayer(nn.Module):
    """Cross Stage Partial Layer.

    The input is split through two 1x1 convs: the main branch runs a stack
    of bottleneck blocks, the short branch is passed through unchanged; the
    two are concatenated (optionally channel-attended) and fused by a final
    1x1 conv.

    Args:
        in_channels (int): The input channels of the CSP layer.
        out_channels (int): The output channels of the CSP layer.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Defaults to 0.5.
        num_blocks (int): Number of blocks. Defaults to 1.
        add_identity (bool): Whether to add identity in blocks.
            Defaults to True.
        use_depthwise (bool): Whether to use depthwise separable convolution
            in blocks. Defaults to False.
        use_cspnext_block (bool): Whether to use CSPNeXt block.
            Defaults to False.
        channel_attention (bool): Whether to add channel attention in each
            stage. Defaults to False.
        conv_cfg (dict, optional): Config dict for convolution layer.
            Defaults to None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Defaults to dict(type='BN')
        act_cfg (dict): Config dict for activation layer.
            Defaults to dict(type='Swish')
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 add_identity: bool = True,
                 use_depthwise: bool = False,
                 use_cspnext_block: bool = False,
                 channel_attention: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='Swish'),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        block_cls = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck
        mid_channels = int(out_channels * expand_ratio)
        self.channel_attention = channel_attention
        # 1x1 projections for the two CSP branches.
        self.main_conv = ConvModule(
            in_channels,
            mid_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.short_conv = ConvModule(
            in_channels,
            mid_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.final_conv = ConvModule(
            2 * mid_channels,
            out_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.blocks = nn.Sequential(*(
            block_cls(
                mid_channels,
                mid_channels,
                1.0,
                add_identity,
                use_depthwise,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg) for _ in range(num_blocks)))
        if channel_attention:
            self.attention = ChannelAttention(2 * mid_channels)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        short_feat = self.short_conv(x)
        main_feat = self.blocks(self.main_conv(x))
        merged = torch.cat((main_feat, short_feat), dim=1)
        if self.channel_attention:
            merged = self.attention(merged)
        return self.final_conv(merged)
class CSPNeXt(nn.Module):
    """CSPNeXt backbone used in RTMDet.
    This is a standalone implementation without requiring the mmdet registry.
    Args:
        arch (str): Architecture of CSPNeXt, from {P5, P6}.
            Defaults to P5.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Defaults to 0.5.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        use_depthwise (bool): Whether to use depthwise separable convolution.
            Defaults to False.
        arch_ovewrite (list): Overwrite default arch settings.
            Defaults to None.
        spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP
            layers. Defaults to (5, 9, 13).
        channel_attention (bool): Whether to add channel attention in each
            stage. Defaults to True.
        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
            convolution layer. Defaults to None.
        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
            config norm layer. Defaults to dict(type='BN', requires_grad=True).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to dict(type='SiLU').
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only.
    """
    # From left to right:
    # in_channels, out_channels, num_blocks, add_identity, use_spp
    arch_settings = {
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, 1024, 3, False, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, False, True]]
    }

    def __init__(
        self,
        arch: str = 'P5',
        deepen_factor: float = 1.0,
        widen_factor: float = 1.0,
        out_indices: Sequence[int] = (2, 3, 4),
        frozen_stages: int = -1,
        use_depthwise: bool = False,
        expand_ratio: float = 0.5,
        arch_ovewrite: dict = None,
        spp_kernel_sizes: Sequence[int] = (5, 9, 13),
        channel_attention: bool = True,
        conv_cfg: OptConfigType = None,
        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg: ConfigType = dict(type='SiLU'),
        norm_eval: bool = False,
        # NOTE(review): init_cfg is accepted for API parity but is not
        # applied anywhere in this implementation.
        init_cfg: OptMultiConfig = dict(
            type='Kaiming',
            layer='Conv2d',
            a=math.sqrt(5),
            distribution='uniform',
            mode='fan_in',
            nonlinearity='leaky_relu')
    ) -> None:
        super().__init__()
        arch_setting = self.arch_settings[arch]
        if arch_ovewrite:
            arch_setting = arch_ovewrite
        assert set(out_indices).issubset(
            i for i in range(len(arch_setting) + 1))
        if frozen_stages not in range(-1, len(arch_setting) + 1):
            raise ValueError('frozen_stages must be in range(-1, '
                             'len(arch_setting) + 1). But received '
                             f'{frozen_stages}')
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.use_depthwise = use_depthwise
        self.norm_eval = norm_eval
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
        # Stem: three 3x3 convs, only the first downsamples (stride 2).
        self.stem = nn.Sequential(
            ConvModule(
                3,
                int(arch_setting[0][0] * widen_factor // 2),
                3,
                padding=1,
                stride=2,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                int(arch_setting[0][0] * widen_factor // 2),
                int(arch_setting[0][0] * widen_factor // 2),
                3,
                padding=1,
                stride=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                int(arch_setting[0][0] * widen_factor // 2),
                int(arch_setting[0][0] * widen_factor),
                3,
                padding=1,
                stride=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))
        self.layers = ['stem']
        # One stage per arch row: stride-2 conv [+ SPP] + CSP layer.
        for i, (in_channels, out_channels, num_blocks, add_identity,
                use_spp) in enumerate(arch_setting):
            in_channels = int(in_channels * widen_factor)
            out_channels = int(out_channels * widen_factor)
            num_blocks = max(round(num_blocks * deepen_factor), 1)
            stage = []
            conv_layer = conv(
                in_channels,
                out_channels,
                3,
                stride=2,
                padding=1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
            stage.append(conv_layer)
            if use_spp:
                spp = SPPBottleneck(
                    out_channels,
                    out_channels,
                    kernel_sizes=spp_kernel_sizes,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg)
                stage.append(spp)
            csp_layer = CSPLayer(
                out_channels,
                out_channels,
                num_blocks=num_blocks,
                add_identity=add_identity,
                use_depthwise=use_depthwise,
                use_cspnext_block=True,
                expand_ratio=expand_ratio,
                channel_attention=channel_attention,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
            stage.append(csp_layer)
            self.add_module(f'stage{i + 1}', nn.Sequential(*stage))
            self.layers.append(f'stage{i + 1}')

    def freeze_stages(self) -> None:
        """Freeze stages parameters."""
        # Freezes stem plus the first `frozen_stages` stages: eval mode and
        # no gradients.
        if self.frozen_stages >= 0:
            for i in range(self.frozen_stages + 1):
                m = getattr(self, self.layers[i])
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode=True) -> None:
        """Convert the model into training mode while keeping normalization layer
        frozen."""
        super().train(mode)
        self.freeze_stages()
        if mode and self.norm_eval:
            # Keep BN running stats frozen even while training.
            for m in self.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()

    def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
        """Run all stages, collecting outputs at ``self.out_indices``."""
        outs = []
        for i, layer_name in enumerate(self.layers):
            layer = getattr(self, layer_name)
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)
        return tuple(outs)
class CSPNeXtPAFPN(nn.Module):
    """Path Aggregation Network with CSPNeXt blocks.
    This is a standalone implementation that works with the CSPNeXt backbone.
    Args:
        in_channels (Sequence[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale)
        out_indices (Sequence[int]): Output from which stages.
        num_csp_blocks (int): Number of bottlenecks in CSPLayer.
            Defaults to 3.
        use_depthwise (bool): Whether to use depthwise separable convolution in
            blocks. Defaults to False.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Default: 0.5
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(scale_factor=2, mode='nearest')`
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN')
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='Swish')
    """

    def __init__(
        self,
        in_channels: Sequence[int],
        out_channels: int,
        out_indices=(0, 1, 2),
        num_csp_blocks: int = 3,
        use_depthwise: bool = False,
        expand_ratio: float = 0.5,
        upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'),
        conv_cfg: OptConfigType = None,
        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg: ConfigType = dict(type='Swish'),
        # NOTE(review): init_cfg is accepted for API parity but is not
        # applied anywhere in this implementation.
        init_cfg: OptMultiConfig = dict(
            type='Kaiming',
            layer='Conv2d',
            a=math.sqrt(5),
            distribution='uniform',
            mode='fan_in',
            nonlinearity='leaky_relu')
    ) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.out_indices = out_indices
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
        # build top-down blocks
        self.upsample = nn.Upsample(**upsample_cfg)
        self.reduce_layers = nn.ModuleList()
        self.top_down_blocks = nn.ModuleList()
        # Iterate high-to-low resolution; each step reduces the higher-level
        # channels to match the lower level, then fuses via a CSP layer.
        for idx in range(len(in_channels) - 1, 0, -1):
            self.reduce_layers.append(
                ConvModule(
                    in_channels[idx],
                    in_channels[idx - 1],
                    1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            self.top_down_blocks.append(
                CSPLayer(
                    in_channels[idx - 1] * 2,
                    in_channels[idx - 1],
                    num_blocks=num_csp_blocks,
                    add_identity=False,
                    use_depthwise=use_depthwise,
                    use_cspnext_block=True,
                    expand_ratio=expand_ratio,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
        # build bottom-up blocks
        self.downsamples = nn.ModuleList()
        self.bottom_up_blocks = nn.ModuleList()
        for idx in range(len(in_channels) - 1):
            self.downsamples.append(
                conv(
                    in_channels[idx],
                    in_channels[idx],
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            self.bottom_up_blocks.append(
                CSPLayer(
                    in_channels[idx] * 2,
                    in_channels[idx + 1],
                    num_blocks=num_csp_blocks,
                    add_identity=False,
                    use_depthwise=use_depthwise,
                    use_cspnext_block=True,
                    expand_ratio=expand_ratio,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
        # Optional per-scale 3x3 convs projecting every level to
        # `out_channels` (skipped when out_channels is None).
        if self.out_channels is not None:
            self.out_convs = nn.ModuleList()
            for i in range(len(in_channels)):
                self.out_convs.append(
                    conv(
                        in_channels[i],
                        out_channels,
                        3,
                        padding=1,
                        conv_cfg=conv_cfg,
                        norm_cfg=norm_cfg,
                        act_cfg=act_cfg))

    def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]:
        """
        Args:
            inputs (tuple[Tensor]): input features.
        Returns:
            tuple[Tensor]: YOLOXPAFPN features.
        """
        assert len(inputs) == len(self.in_channels)
        # top-down path
        inner_outs = [inputs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]
            feat_high = self.reduce_layers[len(self.in_channels) - 1 - idx](
                feat_high)
            # Store the reduced feature so the bottom-up path reuses it.
            inner_outs[0] = feat_high
            upsample_feat = self.upsample(feat_high)
            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
                torch.cat([upsample_feat, feat_low], 1))
            inner_outs.insert(0, inner_out)
        # bottom-up path
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](
                torch.cat([downsample_feat, feat_high], 1))
            outs.append(out)
        if self.out_channels is not None:
            # out convs
            for idx in range(len(outs)):
                outs[idx] = self.out_convs[idx](outs[idx])
        return tuple([outs[i] for i in self.out_indices])
class MlvlPointGenerator:
    """Standard points generator for multi-level feature maps.

    Generates the per-location prior points (anchor centers) used by
    anchor-free heads such as RTMDet.

    Args:
        strides (int | list | tuple): Stride of each feature-map level; a
            scalar is promoted to a single-level list.
        offset (float): Offset, in stride units, added to each grid index;
            0.5 centers each point in its cell. Defaults to 0.5.
    """

    def __init__(
        self,
        strides,
        offset: float = 0.5
    ) -> None:
        if not isinstance(strides, (list, tuple)):
            strides = [strides]
        self.strides = strides
        self.offset = offset

    def grid_priors(
        self,
        featmap_sizes,
        dtype=torch.float32,
        device='cuda',
        with_stride=False
    ):
        """Generate grid points of multiple feature levels.

        Args:
            featmap_sizes (list[tuple[int, int]]): ``(h, w)`` per level;
                must match ``len(self.strides)`` entries at most.
            dtype: Output dtype. Defaults to ``torch.float32``.
            device: Output device. Defaults to ``'cuda'``.
            with_stride (bool): If True, each prior is ``(x, y, stride,
                stride)`` instead of ``(x, y)``.

        Returns:
            list[Tensor]: One ``(h*w, 2 or 4)`` tensor per level.
        """
        return [
            self.single_level_grid_priors(
                featmap_sizes[i],
                level_idx=i,
                dtype=dtype,
                device=device,
                with_stride=with_stride) for i in range(len(featmap_sizes))
        ]

    def single_level_grid_priors(
        self,
        featmap_size,
        level_idx,
        dtype=torch.float32,
        device='cuda',
        with_stride=False
    ):
        """Generate grid points for a single feature level.

        Points are laid out row-major (all x for y=0, then y=1, ...), in
        input-image coordinates: ``(index + offset) * stride``.
        """
        feat_h, feat_w = featmap_size
        stride = self.strides[level_idx]
        # Shift indices by `offset` cells, then scale to image coordinates.
        shift_x = ((torch.arange(0, feat_w, device=device) + self.offset) *
                   stride).to(dtype)
        shift_y = ((torch.arange(0, feat_h, device=device) + self.offset) *
                   stride).to(dtype)
        shift_yy, shift_xx = torch.meshgrid(shift_y, shift_x, indexing="ij")
        shift_xx = shift_xx.reshape(-1)
        shift_yy = shift_yy.reshape(-1)
        if not with_stride:
            return torch.stack([shift_xx, shift_yy], dim=-1)
        # full_like expects a plain Number fill value; the previous code
        # wrapped `stride` in a 0-d tensor, which relies on implicit
        # conversion and fails on older torch versions.
        stride_xx = torch.full_like(shift_xx, stride)
        stride_yy = torch.full_like(shift_yy, stride)
        return torch.stack([shift_xx, shift_yy, stride_xx, stride_yy], dim=-1)
# Helper functions needed for geometric mean sigmoid
def sigmoid_geometric_mean(x, y):
    """Return the element-wise geometric mean of ``sigmoid(x)`` and ``sigmoid(y)``."""
    return (x.sigmoid() * y.sigmoid()).sqrt()
def inverse_sigmoid(x, eps=1e-5):
    """Return the logit of ``x``, numerically stabilized.

    ``x`` is first clamped to [0, 1]; numerator and denominator are then
    floored at ``eps`` so the log never sees zero.
    """
    x = x.clamp(0, 1)
    num = x.clamp(min=eps)
    den = (1 - x).clamp(min=eps)
    return (num / den).log()
class RTMDetSepBNHead(nn.Module):
    """RTMDet head with per-level BN layers and optionally shared convs.

    For each feature level the head predicts a per-class score map and a
    4-channel distance map (left, top, right, bottom) that is decoded
    against :class:`MlvlPointGenerator` priors at inference time.
    """
    def __init__(
        self,
        num_classes: int,
        in_channels: int,
        share_conv: bool = True,
        use_depthwise: bool = False,
        pred_kernel_size: int = 1,
        stacked_convs: int = 2,
        feat_channels: int = 256,
        strides: Sequence[int] = (8, 16, 32),
        with_objectness: bool = False,
        exp_on_reg: bool = False,
    ) -> None:
        """
        Args:
            num_classes (int): Number of object categories.
            in_channels (int): Channels of each input feature map.
            share_conv (bool): Share tower modules across levels.
            use_depthwise (bool): Use depthwise-separable tower convs.
            pred_kernel_size (int): Kernel size of the prediction convs.
            stacked_convs (int): Convs per tower.
            feat_channels (int): Tower channel width.
            strides (Sequence[int]): Stride of each feature level. The
                default is a tuple (was a mutable list default).
            with_objectness (bool): Add an objectness branch.
            exp_on_reg (bool): Apply ``exp`` to regression outputs.
        """
        super().__init__()
        self.num_classes = num_classes
        # Sigmoid classification: one output channel per class (no bg class).
        self.cls_out_channels = num_classes
        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.stacked_convs = stacked_convs
        self.share_conv = share_conv
        self.use_depthwise = use_depthwise
        self.pred_kernel_size = pred_kernel_size
        self.with_objectness = with_objectness
        self.exp_on_reg = exp_on_reg
        self.strides = strides
        # Anchor-free: a single prior per grid point.
        self.num_base_priors = 1
        self._init_layers()
    def _init_layers(self) -> None:
        """Build per-level conv towers and prediction layers."""
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.rtm_cls = nn.ModuleList()
        self.rtm_reg = nn.ModuleList()
        if self.with_objectness:
            self.rtm_obj = nn.ModuleList()
        # Both branches used identical kwargs, so pick the conv class once.
        conv_cls = (DepthwiseSeparableConvModule
                    if self.use_depthwise else ConvModule)
        for n in range(len(self.strides)):
            cls_convs = nn.ModuleList()
            reg_convs = nn.ModuleList()
            for i in range(self.stacked_convs):
                chn = self.in_channels if i == 0 else self.feat_channels
                cls_convs.append(
                    conv_cls(
                        chn,
                        self.feat_channels,
                        3,
                        stride=1,
                        padding=1,
                        bias=False,
                        act_cfg=dict(type='SiLU'),
                        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)))
                reg_convs.append(
                    conv_cls(
                        chn,
                        self.feat_channels,
                        3,
                        stride=1,
                        padding=1,
                        bias=False,
                        act_cfg=dict(type='SiLU'),
                        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)))
            self.cls_convs.append(cls_convs)
            self.reg_convs.append(reg_convs)
            self.rtm_cls.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_base_priors * self.cls_out_channels,
                    self.pred_kernel_size,
                    padding=self.pred_kernel_size // 2))
            self.rtm_reg.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_base_priors * 4,
                    self.pred_kernel_size,
                    padding=self.pred_kernel_size // 2))
            if self.with_objectness:
                self.rtm_obj.append(
                    nn.Conv2d(
                        self.feat_channels,
                        1,
                        self.pred_kernel_size,
                        padding=self.pred_kernel_size // 2))
        if self.share_conv:
            # Alias the whole tower module (conv + BN) across levels.
            # NOTE(review): upstream RTMDet shares only the conv weights
            # and keeps per-level BN — confirm against released checkpoints.
            for n in range(1, len(self.strides)):
                for i in range(self.stacked_convs):
                    self.cls_convs[n][i] = self.cls_convs[0][i]
                    self.reg_convs[n][i] = self.reg_convs[0][i]
        # Point priors for anchor-free decoding (offset 0 => cell corner).
        self.prior_generator = MlvlPointGenerator(self.strides, offset=0.0)
    def init_weights(self):
        """Initialize head weights.

        Convs get a normal(0, 0.01) init; classification biases get the
        focal-style prior init so initial scores sit near 0.01.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, mean=0, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            if isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
        # sigmoid(bias_init) == 0.01; plain math avoids a throwaway tensor.
        bias_init = float(-math.log((1 - 0.01) / 0.01))
        for rtm_cls in self.rtm_cls:
            nn.init.normal_(rtm_cls.weight, mean=0, std=0.01)
            nn.init.constant_(rtm_cls.bias, bias_init)
        for rtm_reg in self.rtm_reg:
            nn.init.normal_(rtm_reg.weight, mean=0, std=0.01)
            nn.init.constant_(rtm_reg.bias, 0)
        if self.with_objectness:
            for rtm_obj in self.rtm_obj:
                nn.init.normal_(rtm_obj.weight, mean=0, std=0.01)
                nn.init.constant_(rtm_obj.bias, bias_init)
    def forward(self, feats):
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each
                a 4D tensor ``(B, C, H, W)``.

        Returns:
            tuple: ``(cls_scores, bbox_preds)`` where
                - cls_scores (tuple[Tensor]): Class logits per level.
                - bbox_preds (tuple[Tensor]): Distance predictions per
                  level, already scaled by the level stride.
        """
        cls_scores = []
        bbox_preds = []
        for idx, (x, stride) in enumerate(
                zip(feats, self.strides)):
            cls_feat = x
            reg_feat = x
            for cls_layer in self.cls_convs[idx]:
                cls_feat = cls_layer(cls_feat)
            cls_score = self.rtm_cls[idx](cls_feat)
            for reg_layer in self.reg_convs[idx]:
                reg_feat = reg_layer(reg_feat)
            if self.with_objectness:
                objectness = self.rtm_obj[idx](reg_feat)
                # Fold objectness into the class logits so downstream code
                # only ever sees a single logit map.
                cls_score = inverse_sigmoid(
                    sigmoid_geometric_mean(cls_score, objectness))
            if self.exp_on_reg:
                # exp keeps the predicted distances strictly positive.
                reg_dist = self.rtm_reg[idx](reg_feat).exp() * stride
            else:
                reg_dist = self.rtm_reg[idx](reg_feat) * stride
            cls_scores.append(cls_score)
            bbox_preds.append(reg_dist)
        return tuple(cls_scores), tuple(bbox_preds)
    def predict(self, cls_scores, bbox_preds, batch_img_metas=None, cfg=None,
                rescale=False, with_nms=True, score_thr=0.05,
                nms_iou_threshold=0.6, max_per_img=100):
        """Transform network outputs into batched bbox predictions.

        Simplified inference-only post-processing.

        Args:
            cls_scores (list[Tensor]): Per-level class logits
                ``(B, num_classes, H, W)``.
            bbox_preds (list[Tensor]): Per-level distances ``(B, 4, H, W)``.
            batch_img_metas (list[dict], optional): Per-image metadata;
                synthesized from the feature-map sizes when omitted.
            cfg: Unused; kept for signature compatibility.
            rescale (bool): Divide boxes by ``scale_factor`` if True.
            with_nms (bool): Apply class-wise NMS if True.
            score_thr (float): Minimum kept class score.
            nms_iou_threshold (float): IoU threshold for NMS.
            max_per_img (int): Detection cap per image.

        Returns:
            DetectionOutput: Batched (zero-padded where ragged)
            boxes/scores/labels.
        """
        assert len(cls_scores) == len(bbox_preds)
        num_levels = len(cls_scores)
        device = cls_scores[0].device
        batch_size = cls_scores[0].shape[0]
        # If no image metadata is provided, approximate the input size from
        # the largest (feature size × stride) across levels.
        if batch_img_metas is None:
            featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
            strides = self.strides
            upscaled_sizes = []
            for i, featmap_size in enumerate(featmap_sizes):
                h, w = featmap_size
                upscaled_sizes.append((h * strides[i], w * strides[i]))
            img_h = max(s[0] for s in upscaled_sizes)
            img_w = max(s[1] for s in upscaled_sizes)
            batch_img_metas = [{
                'img_shape': (img_h, img_w, 3),
                'scale_factor': [1.0, 1.0, 1.0, 1.0]
            } for _ in range(batch_size)]
        featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
        # Grid priors (x, y, stride, stride) for every level.
        mlvl_priors = self.prior_generator.grid_priors(
            featmap_sizes,
            dtype=cls_scores[0].dtype,
            device=device,
            with_stride=True)
        result_list = []
        for img_id in range(batch_size):
            img_meta = batch_img_metas[img_id]
            cls_score_list = [
                cls_scores[i][img_id].detach() for i in range(num_levels)
            ]
            bbox_pred_list = [
                bbox_preds[i][img_id].detach() for i in range(num_levels)
            ]
            results = self._predict_by_feat_single(
                cls_score_list,
                bbox_pred_list,
                mlvl_priors,
                img_meta,
                score_thr=score_thr,
                nms_iou_threshold=nms_iou_threshold,
                max_per_img=max_per_img,
                rescale=rescale,
                with_nms=with_nms
            )
            result_list.append(results)
        # Normalize per-image results into aligned tensors.
        boxes_batch = []
        scores_batch = []
        labels_batch = []
        for result in result_list:
            boxes = result['bboxes']
            scores = result.get('scores', boxes[:, -1])
            labels = result['labels']
            # Strip the trailing score column some paths append to boxes.
            if boxes.shape[1] > 4:
                boxes = boxes[:, :4]
            boxes_batch.append(boxes)
            scores_batch.append(scores)
            labels_batch.append(labels)
        # Fast path: torch.stack needs the SAME detection count per image,
        # not merely a nonzero one (the previous `all(len > 0)` check
        # crashed on ragged batches).
        num_dets = [len(b) for b in boxes_batch]
        if len(set(num_dets)) == 1 and num_dets[0] > 0:
            return DetectionOutput(
                boxes=torch.stack(boxes_batch),
                scores=torch.stack(scores_batch),
                labels=torch.stack(labels_batch)
            )
        max_num = max(num_dets)
        if max_num == 0:
            # No detections anywhere in the batch.
            dummy = torch.zeros((batch_size, 0, 4), device=device)
            return DetectionOutput(
                boxes=dummy,
                scores=torch.zeros((batch_size, 0), device=device),
                labels=torch.zeros((batch_size, 0), dtype=torch.long, device=device)
            )
        # Zero-pad ragged results to a common per-image length.
        padded_boxes = []
        padded_scores = []
        padded_labels = []
        for boxes, scores, labels in zip(boxes_batch, scores_batch, labels_batch):
            n = len(boxes)
            if n == 0:
                padded_boxes.append(torch.zeros((max_num, 4), device=device))
                padded_scores.append(torch.zeros(max_num, device=device))
                padded_labels.append(torch.zeros(max_num, dtype=torch.long, device=device))
            else:
                padded_boxes.append(torch.cat(
                    [boxes, torch.zeros((max_num - n, 4), device=device)], dim=0))
                padded_scores.append(torch.cat(
                    [scores, torch.zeros(max_num - n, device=device)], dim=0))
                padded_labels.append(torch.cat(
                    [labels, torch.zeros(max_num - n, dtype=torch.long, device=device)],
                    dim=0))
        return DetectionOutput(
            boxes=torch.stack(padded_boxes),
            scores=torch.stack(padded_scores),
            labels=torch.stack(padded_labels)
        )
    def _predict_by_feat_single(self, cls_score_list, bbox_pred_list, mlvl_priors,
                                img_meta, score_thr=0.05, nms_iou_threshold=0.6,
                                max_per_img=100, rescale=False, with_nms=True):
        """Post-process one image's multi-level predictions.

        Returns a dict with ``bboxes`` ``(M, 5)`` (score appended),
        ``scores`` ``(M,)`` and ``labels`` ``(M,)``.
        """
        mlvl_bboxes = []
        mlvl_scores = []
        for level_idx, (cls_score, bbox_pred, priors) in enumerate(
                zip(cls_score_list, bbox_pred_list, mlvl_priors)):
            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
            # (C, H, W) -> (H*W, C) so each row is one grid point.
            cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels)
            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
            scores = torch.sigmoid(cls_score)
            # Keep only points whose best class clears the threshold.
            max_scores, _ = scores.max(dim=1)
            keep_mask = max_scores > score_thr
            scores = scores[keep_mask]
            bbox_pred = bbox_pred[keep_mask]
            priors = priors[keep_mask]
            if scores.numel() == 0:
                continue
            bboxes = self._decode_bboxes(priors, bbox_pred, img_meta.get('img_shape'))
            mlvl_bboxes.append(bboxes)
            mlvl_scores.append(scores)
        if len(mlvl_bboxes) == 0:
            # No valid predictions at any level.
            return {
                'bboxes': torch.zeros((0, 4), device=cls_score_list[0].device),
                'scores': torch.zeros((0,), device=cls_score_list[0].device),
                'labels': torch.zeros((0,), device=cls_score_list[0].device, dtype=torch.long)
            }
        bboxes = torch.cat(mlvl_bboxes)
        scores = torch.cat(mlvl_scores)
        # Optional rescaling back to the original image resolution.
        if rescale and 'scale_factor' in img_meta:
            scale = bboxes.new_tensor(img_meta['scale_factor']).flatten()
            if scale.numel() == 2:
                # (w_scale, h_scale) -> (w, h, w, h) for x1, y1, x2, y2.
                scale = scale.repeat(2)
            # The previous code repeated a 4-element factor into 8 columns,
            # which could not broadcast against (N, 4) boxes.
            bboxes = bboxes / scale[:4]
        if with_nms:
            det_bboxes, det_labels = self._nms(bboxes, scores,
                                               nms_iou_threshold,
                                               max_per_img,
                                               score_thr=score_thr)
        else:
            # Top-k over the flattened (box, class) score matrix, no NMS.
            scores_flattened = scores.flatten()
            if scores_flattened.size(0) > max_per_img:
                top_scores, indices = scores_flattened.topk(max_per_img)
                # Flat index encodes (box, class): row = idx // C,
                # class = idx % C. (The old code indexed boxes with the
                # raw flat index, selecting the wrong boxes.)
                box_idx = torch.div(indices, self.num_classes,
                                    rounding_mode='floor')
                bboxes_top_k = bboxes.index_select(0, box_idx)
                labels_top_k = indices % self.num_classes
                det_bboxes = torch.cat([bboxes_top_k, top_scores.unsqueeze(-1)], dim=1)
                det_labels = labels_top_k
            else:
                # Fewer candidates than the cap: keep each box's best class.
                max_scores, labels = scores.max(dim=1)
                det_bboxes = torch.cat([bboxes, max_scores.unsqueeze(-1)], dim=1)
                det_labels = labels
        return {
            'bboxes': det_bboxes,
            'scores': det_bboxes[:, -1],
            'labels': det_labels
        }
    def _decode_bboxes(self, priors, distance, max_shape=None):
        """Decode distance predictions to bounding box coordinates.

        Args:
            priors (Tensor): ``(..., >=2)`` grid points; only x, y are read.
            distance (Tensor): ``(..., 4)`` distances (l, t, r, b).
            max_shape (tuple, optional): ``(H, W, ...)`` used to clip boxes.

        Returns:
            Tensor: ``(..., 4)`` boxes in (x1, y1, x2, y2) format.
        """
        xy = priors[..., :2]
        x1 = xy[..., 0] - distance[..., 0]
        y1 = xy[..., 1] - distance[..., 1]
        x2 = xy[..., 0] + distance[..., 2]
        y2 = xy[..., 1] + distance[..., 3]
        bboxes = torch.stack([x1, y1, x2, y2], -1)
        # In-place clamp is safe: `bboxes` was freshly created above.
        if max_shape is not None:
            bboxes[..., 0].clamp_(min=0, max=max_shape[1])
            bboxes[..., 1].clamp_(min=0, max=max_shape[0])
            bboxes[..., 2].clamp_(min=0, max=max_shape[1])
            bboxes[..., 3].clamp_(min=0, max=max_shape[0])
        return bboxes
    def _nms(self, bboxes, scores, iou_threshold, max_per_img, score_thr=0.05):
        """Class-wise NMS.

        Args:
            bboxes (Tensor): ``(N, 4)`` boxes.
            scores (Tensor): ``(N, num_classes)`` per-class scores.
            iou_threshold (float): Suppression IoU threshold.
            max_per_img (int): Cap on returned detections.
            score_thr (float): Per-class score floor (previously a
                hard-coded 0.05 regardless of the caller's threshold).

        Returns:
            tuple: ``det_bboxes`` ``(M, 5)`` with a score column and
            ``det_labels`` ``(M,)``.
        """
        num_classes = scores.shape[1]
        det_bboxes = []
        det_labels = []
        for cls_idx in range(num_classes):
            cls_scores = scores[:, cls_idx]
            keep_idx = cls_scores > score_thr
            if not keep_idx.any():
                continue
            cls_bboxes = bboxes[keep_idx]
            cls_scores = cls_scores[keep_idx]
            keep = self._batched_nms(cls_bboxes, cls_scores, iou_threshold)
            keep = keep[:max_per_img]
            det_bboxes.append(torch.cat(
                [cls_bboxes[keep], cls_scores[keep].unsqueeze(-1)], dim=1))
            det_labels.append(cls_bboxes.new_full((keep.size(0),), cls_idx,
                                                  dtype=torch.long))
        if len(det_bboxes) > 0:
            det_bboxes = torch.cat(det_bboxes, dim=0)
            det_labels = torch.cat(det_labels, dim=0)
            # Rank all classes together by confidence, then truncate.
            _, indices = det_bboxes[:, -1].sort(descending=True)
            det_bboxes = det_bboxes[indices][:max_per_img]
            det_labels = det_labels[indices][:max_per_img]
        else:
            det_bboxes = bboxes.new_zeros((0, 5))
            det_labels = bboxes.new_zeros((0,), dtype=torch.long)
        return det_bboxes, det_labels
    def _batched_nms(self, boxes, scores, iou_threshold):
        """Greedy single-class NMS; returns kept indices ordered by score."""
        if boxes.shape[0] == 0:
            return boxes.new_zeros(0, dtype=torch.long)
        try:
            # Prefer the torchvision C++/CUDA implementation when usable.
            return torchvision.ops.nms(boxes, scores, iou_threshold)
        except Exception:
            # Fall back to a pure-torch implementation (bare `except:` was
            # too broad — it also swallowed KeyboardInterrupt/SystemExit).
            x1 = boxes[:, 0]
            y1 = boxes[:, 1]
            x2 = boxes[:, 2]
            y2 = boxes[:, 3]
            areas = (x2 - x1) * (y2 - y1)
            _, order = scores.sort(descending=True)
            keep = []
            while order.size(0) > 0:
                i = order[0].item()
                keep.append(i)
                if order.size(0) == 1:
                    break
                # IoU of the current best box against the remainder.
                xx1 = torch.max(x1[order[1:]], x1[i])
                yy1 = torch.max(y1[order[1:]], y1[i])
                xx2 = torch.min(x2[order[1:]], x2[i])
                yy2 = torch.min(y2[order[1:]], y2[i])
                w = torch.clamp(xx2 - xx1, min=0)
                h = torch.clamp(yy2 - yy1, min=0)
                inter = w * h
                iou = inter / (areas[i] + areas[order[1:]] - inter)
                inds = torch.where(iou <= iou_threshold)[0]
                order = order[inds + 1]
            return torch.tensor(keep, dtype=torch.long, device=boxes.device)
class RTMDetModel(PreTrainedModel):
    """
    RTMDet object detection model compatible with Hugging Face transformers.

    Pure-PyTorch implementation (no NumPy or OpenCV dependencies) composed
    of a CSPNeXt backbone, a CSPNeXt-PAFPN neck, and a separated-BN RTMDet
    head. Inference post-processing (thresholding + NMS) happens inside
    ``forward`` via the head's ``predict`` method.
    """
    config_class = RTMDetConfig
    base_model_prefix = "rtmdet"
    main_input_name = "pixel_values"
    # share_conv aliases BN modules across strides; override tied-weights
    # method so transformers never calls get_parameter() on BN buffers.
    _tied_weights_keys = None
    def mark_tied_weights_as_initialized(self):
        pass # share_conv makes BN buffers look tied — skip to avoid AttributeError
    def __init__(self, config):
        """Build backbone/neck/head from `config` and run HF post-init.

        Args:
            config (RTMDetConfig): Architecture and post-processing settings.
        """
        super().__init__(config)
        # Build backbone (depthwise convs disabled for all three stages).
        self.backbone = CSPNeXt(
            arch=config.backbone_arch,
            deepen_factor=config.backbone_deepen_factor,
            widen_factor=config.backbone_widen_factor,
            expand_ratio=config.backbone_expand_ratio,
            channel_attention=config.backbone_channel_attention,
            use_depthwise=False,
        )
        # Build neck
        self.neck = CSPNeXtPAFPN(
            in_channels=config.neck_in_channels,
            out_channels=config.neck_out_channels,
            num_csp_blocks=config.neck_num_csp_blocks,
            expand_ratio=config.neck_expand_ratio,
            use_depthwise=False,
        )
        # Build head
        self.bbox_head = RTMDetSepBNHead(
            num_classes=config.num_classes,
            in_channels=config.head_in_channels,
            stacked_convs=config.head_stacked_convs,
            feat_channels=config.head_feat_channels,
            with_objectness=config.head_with_objectness,
            exp_on_reg=config.head_exp_on_reg,
            share_conv=config.head_share_conv,
            pred_kernel_size=config.head_pred_kernel_size,
            strides=config.strides,
            use_depthwise=False
        )
        # Initialize weights BEFORE post_init so pretrained-checkpoint
        # loading (driven by post_init) can overwrite them.
        self.init_weights()
        # Required: triggers post_init() which sets all_tied_weights_keys etc.
        self.post_init()
    def init_weights(self):
        """Initialize the weights of the model.

        Only the head gets an explicit init here; the backbone/neck are
        expected to be overwritten by pretrained weights.
        """
        # Backbone is usually initialized from pre-trained weights
        # so we don't need special initialization
        # Initialize head
        self.bbox_head.init_weights()
    def forward(
        self,
        pixel_values=None,
        original_size=None,
        labels=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Forward pass of the model.
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                Pixel values resized to 640×640 by the image processor.
            original_size (`Tuple[int, int]`, *optional*):
                ``(height, width)`` of the **original** image before preprocessing.
                When supplied, the returned boxes are automatically scaled from
                640×640 model-input space to original image pixel coordinates so
                the caller never needs to compute ``sx = orig_w / 640`` manually.
                All images in the batch are assumed to share the same original size.
            labels (`List[Dict]`, *optional*):
                Labels for computing the detection loss. NOTE: loss computation
                is not implemented — passing labels returns a zero loss.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
                (Currently unused by this implementation.)
            return_dict (`bool`, *optional*):
                Whether or not to return a ModelOutput instead of a plain tuple.
        Returns:
            `DetectionOutput` or `tuple`:
                Boxes are in 640×640 space by default, or in original image space
                when ``original_size`` is provided.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Get inputs
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        batch_size, channels, height, width = pixel_values.shape
        # Extract features from backbone
        backbone_features = self.backbone(pixel_values)
        # Process features through neck
        neck_features = self.neck(backbone_features)
        # Get cls_scores and bbox_preds from head
        cls_scores, bbox_preds = self.bbox_head(neck_features)
        if labels is not None:
            # Training mode: calculate loss (not implemented in this simplified version)
            loss = torch.tensor(0.0, device=pixel_values.device)
            if return_dict:
                return DetectionOutput(loss=loss)
            else:
                return (loss,)
        # Inference mode: Get detection results
        # Metadata assumes unit scale — rescaling is done below instead.
        batch_img_metas = [{
            'img_shape': (height, width, 3),
            'scale_factor': [1.0, 1.0, 1.0, 1.0]
        } for _ in range(batch_size)]
        # Call predict method with parameters from config
        results = self.bbox_head.predict(
            cls_scores=cls_scores,
            bbox_preds=bbox_preds,
            batch_img_metas=batch_img_metas,
            rescale=False,
            with_nms=True,
            score_thr=self.config.score_threshold,
            nms_iou_threshold=self.config.nms_threshold,
            max_per_img=self.config.max_detections
        )
        # Scale boxes from 640×640 model space → original image space if requested
        if original_size is not None:
            orig_h, orig_w = original_size
            sx = orig_w / width # width == 640
            sy = orig_h / height # height == 640
            # Clone so the head's output tensor is left untouched.
            scaled_boxes = results.boxes.clone()
            scaled_boxes[..., 0] *= sx # x1
            scaled_boxes[..., 2] *= sx # x2
            scaled_boxes[..., 1] *= sy # y1
            scaled_boxes[..., 3] *= sy # y2
            results = DetectionOutput(
                boxes=scaled_boxes,
                scores=results.scores,
                labels=results.labels,
            )
        if return_dict:
            return results
        else:
            # Return as tuple (boxes, scores, labels)
            return (results.boxes, results.scores, results.labels)