| """ |
| HAT model components and building blocks. |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| import math |
| from einops import rearrange |
|
|
|
|
def to_2tuple(x):
    """Convert input to a tuple of length 2."""
    if isinstance(x, (tuple, list)):
        return tuple(x)
    return (x, x)


def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    """Fill a tensor in place with values from a truncated normal distribution.

    Values are drawn from a normal distribution with the given mean and
    standard deviation, restricted to the interval [a, b].
    """

    def norm_cdf(x):
        # CDF of the standard normal distribution.
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    with torch.no_grad():
        # Sample uniformly in CDF space, then map back through the inverse
        # CDF (erfinv) to obtain truncated-normal samples.
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)
        tensor.uniform_(2 * l - 1, 2 * u - 1)
        tensor.erfinv_()
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)
        tensor.clamp_(min=a, max=b)
    return tensor
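# Usage sketch (values assumed for illustration): transformer weights are
# typically initialised in place, e.g.
#
#   linear = nn.Linear(96, 96)
#   trunc_normal_(linear.weight, std=.02)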


def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (stochastic depth) per sample during training.

    Each sample in the batch is zeroed with probability ``drop_prob`` and the
    survivors are rescaled by ``1 / keep_prob`` so the expected value is
    unchanged.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dims.
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output


class DropPath(nn.Module):
    """Module wrapper around :func:`drop_path` (stochastic depth)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
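# Usage sketch (rate and depth assumed for illustration): blocks usually
# receive linearly increasing drop-path rates across the network depth, e.g.
#
#   dpr = [r.item() for r in torch.linspace(0, 0.1, 6)]
#   drop_paths = nn.ModuleList(DropPath(p) for p in dpr)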


class ChannelAttention(nn.Module):
    """Channel attention: squeeze-and-excitation style reweighting of channels.

    Args:
        num_feat (int): Number of feature channels.
        squeeze_factor (int): Channel squeeze factor. Default: 16.
    """

    def __init__(self, num_feat, squeeze_factor=16):
        super(ChannelAttention, self).__init__()
        self.attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(num_feat, num_feat // squeeze_factor, 1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(num_feat // squeeze_factor, num_feat, 1, padding=0),
            nn.Sigmoid())

    def forward(self, x):
        y = self.attention(x)
        return x * y


class CAB(nn.Module):
    """Channel attention block: two 3x3 convolutions followed by channel attention."""

    def __init__(self, num_feat, compress_ratio=3, squeeze_factor=30):
        super(CAB, self).__init__()
        self.cab = nn.Sequential(
            nn.Conv2d(num_feat, num_feat // compress_ratio, 3, 1, 1),
            nn.GELU(),
            nn.Conv2d(num_feat // compress_ratio, num_feat, 3, 1, 1),
            ChannelAttention(num_feat, squeeze_factor))

    def forward(self, x):
        return self.cab(x)
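# Shape sketch (sizes assumed for illustration): CAB operates on NCHW feature
# maps and preserves the input shape, e.g.
#
#   cab = CAB(num_feat=96)
#   y = cab(torch.randn(1, 96, 64, 64))  # -> (1, 96, 64, 64)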


class Mlp(nn.Module):
    """Two-layer feed-forward network with activation and dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """Partition a feature map into non-overlapping windows.

    Args:
        x: Tensor of shape (b, h, w, c).
        window_size (int): Window size.

    Returns:
        Tensor of shape (num_windows * b, window_size, window_size, c).
    """
    b, h, w, c = x.shape
    x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)
    return windows


def window_reverse(windows, window_size, h, w):
    """Reverse :func:`window_partition` back to a (b, h, w, c) feature map."""
    b = int(windows.shape[0] / (h * w / window_size / window_size))
    x = windows.view(b, h // window_size, w // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
    return x
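# Round-trip sketch (sizes assumed for illustration): partition and reverse
# are exact inverses when h and w are multiples of the window size, e.g.
#
#   x = torch.randn(2, 14, 14, 32)
#   windows = window_partition(x, 7)  # -> (8, 7, 7, 32)
#   assert torch.equal(window_reverse(windows, 7, 14, 14), x)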


class WindowAttention(nn.Module):
    """Window-based multi-head self-attention (W-MSA) with relative position bias.

    Supports both shifted and non-shifted windows; the relative position
    index ``rpi`` is precomputed by the caller and passed to ``forward``.
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # (wh, ww)
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # Learnable bias for every relative position inside a window.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, rpi, mask=None):
        """
        Args:
            x: Input features of shape (num_windows * b, n, c).
            rpi: Relative position index of shape (n, n).
            mask: Optional attention mask of shape (num_windows, n, n).
        """
        b_, n, c = x.shape
        qkv = self.qkv(x).reshape(b_, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[rpi.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nh, n, n
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nw = mask.shape[0]
            attn = attn.view(b_ // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, n, n)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(b_, n, c)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
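# The ``rpi`` argument above is a precomputed index table. Below is a minimal
# sketch of how it is typically built, following the standard Swin Transformer
# recipe; the helper name ``calculate_rpi_sa`` and its placement here are
# illustrative — in the full HAT model the parent module computes this once
# and registers it as a buffer.


def calculate_rpi_sa(window_size):
    """Return the (ws*ws, ws*ws) relative position index for W-MSA (sketch)."""
    coords_h = torch.arange(window_size)
    coords_w = torch.arange(window_size)
    coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, ws, ws
    coords_flatten = torch.flatten(coords, 1)  # 2, ws*ws
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, ws*ws, ws*ws
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # ws*ws, ws*ws, 2
    relative_coords[:, :, 0] += window_size - 1  # shift to start from 0
    relative_coords[:, :, 1] += window_size - 1
    relative_coords[:, :, 0] *= 2 * window_size - 1
    return relative_coords.sum(-1)  # ws*ws, ws*ws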


class HAB(nn.Module):
    """Hybrid Attention Block: (shifted) window attention with a parallel CAB branch.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        compress_ratio (int): Channel compression ratio of the CAB branch.
        squeeze_factor (int): Squeeze factor of the channel attention.
        conv_scale (float): Scale applied to the CAB branch output.
        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                 compress_ratio=3, squeeze_factor=30, conv_scale=0.01, mlp_ratio=4.,
                 qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # If the input is smaller than the window, use one full-resolution
            # window and disable shifting.
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, 'shift_size must be in [0, window_size)'

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.conv_scale = conv_scale
        self.conv_block = CAB(num_feat=dim, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x, x_size, rpi_sa, attn_mask):
        h, w = x_size
        b, _, c = x.shape

        shortcut = x
        x = self.norm1(x)
        x = x.view(b, h, w, c)

        # Parallel convolutional (CAB) branch.
        conv_x = self.conv_block(x.permute(0, 3, 1, 2))
        conv_x = conv_x.permute(0, 2, 3, 1).contiguous().view(b, h * w, c)

        # Cyclic shift.
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x
            attn_mask = None

        # Partition windows.
        x_windows = window_partition(shifted_x, self.window_size)
        x_windows = x_windows.view(-1, self.window_size * self.window_size, c)

        # W-MSA / SW-MSA.
        attn_windows = self.attn(x_windows, rpi=rpi_sa, mask=attn_mask)

        # Merge windows.
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)
        shifted_x = window_reverse(attn_windows, self.window_size, h, w)

        # Reverse cyclic shift.
        if self.shift_size > 0:
            attn_x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attn_x = shifted_x
        attn_x = attn_x.view(b, h * w, c)

        # Residual connections: attention branch plus scaled CAB branch, then FFN.
        x = shortcut + self.drop_path(attn_x) + conv_x * self.conv_scale
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x
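# The ``attn_mask`` consumed above blocks attention between patches that only
# became window-neighbours through the cyclic shift. Below is a minimal sketch
# of its construction, following the Swin recipe; the helper name
# ``calculate_mask`` is illustrative, and the full model precomputes this once
# per input size (it is only needed when shift_size > 0).


def calculate_mask(x_size, window_size, shift_size):
    """Return the (num_windows, ws*ws, ws*ws) attention mask for SW-MSA (sketch)."""
    h, w = x_size
    img_mask = torch.zeros((1, h, w, 1))
    h_slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
    w_slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
    cnt = 0
    for hs in h_slices:
        for ws in w_slices:
            img_mask[:, hs, ws, :] = cnt
            cnt += 1
    mask_windows = window_partition(img_mask, window_size)  # nw, ws, ws, 1
    mask_windows = mask_windows.view(-1, window_size * window_size)
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    # Large negative values suppress attention between different regions.
    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
    return attn_mask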


class OCAB(nn.Module):
    """Overlapping cross-attention block.

    Queries come from non-overlapping windows while keys and values are
    unfolded from larger, overlapping windows, enlarging the receptive field
    of the attention.
    """

    def __init__(self, dim, input_resolution, window_size, overlap_ratio, num_heads,
                 qkv_bias=True, qk_scale=None, mlp_ratio=2, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.overlap_win_size = int(window_size * overlap_ratio) + window_size

        self.norm1 = norm_layer(dim)
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        # Extract overlapping key/value windows centred on each query window.
        self.unfold = nn.Unfold(kernel_size=(self.overlap_win_size, self.overlap_win_size),
                                stride=window_size, padding=(self.overlap_win_size - window_size) // 2)

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((window_size + self.overlap_win_size - 1) * (window_size + self.overlap_win_size - 1), num_heads))

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

        self.proj = nn.Linear(dim, dim)

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=nn.GELU)

    def forward(self, x, x_size, rpi):
        h, w = x_size
        b, _, c = x.shape

        shortcut = x
        x = self.norm1(x)
        x = x.view(b, h, w, c)

        qkv = self.qkv(x).reshape(b, h, w, 3, c).permute(3, 0, 4, 1, 2)  # 3, b, c, h, w
        q = qkv[0].permute(0, 2, 3, 1)  # b, h, w, c
        kv = torch.cat((qkv[1], qkv[2]), dim=1)  # b, 2c, h, w

        # Partition queries into non-overlapping windows.
        q_windows = window_partition(q, self.window_size)
        q_windows = q_windows.view(-1, self.window_size * self.window_size, c)

        # Unfold keys/values into overlapping windows.
        kv_windows = self.unfold(kv)  # b, 2c * owh * oww, nw
        kv_windows = rearrange(kv_windows, 'b (nc ch owh oww) nw -> nc (b nw) (owh oww) ch',
                               nc=2, ch=c, owh=self.overlap_win_size, oww=self.overlap_win_size).contiguous()
        k_windows, v_windows = kv_windows[0], kv_windows[1]

        b_, nq, _ = q_windows.shape
        _, n, _ = k_windows.shape
        d = self.dim // self.num_heads
        q = q_windows.reshape(b_, nq, self.num_heads, d).permute(0, 2, 1, 3)  # b_, nh, nq, d
        k = k_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3)  # b_, nh, n, d
        v = v_windows.reshape(b_, n, self.num_heads, d).permute(0, 2, 1, 3)  # b_, nh, n, d

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[rpi.view(-1)].view(
            self.window_size * self.window_size, self.overlap_win_size * self.overlap_win_size, -1)
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nh, nq, n
        attn = attn + relative_position_bias.unsqueeze(0)

        attn = self.softmax(attn)
        attn_windows = (attn @ v).transpose(1, 2).reshape(b_, nq, self.dim)

        # Merge windows back into the full feature map.
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, self.dim)
        x = window_reverse(attn_windows, self.window_size, h, w)
        x = x.view(b, h * w, self.dim)

        x = self.proj(x) + shortcut
        x = x + self.mlp(self.norm2(x))
        return x
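# The ``rpi`` table for OCAB indexes biases between positions in a query
# window and positions in its enlarged key/value window. Below is a sketch of
# one consistent construction; the helper name ``calculate_rpi_oca`` is
# illustrative, and the exact shift convention of the reference HAT
# implementation may differ — any consistent mapping into the learned
# (ws + wse - 1)^2 bias table works.


def calculate_rpi_oca(window_size, overlap_ratio):
    """Return the (ws*ws, wse*wse) relative position index for OCA (sketch)."""
    ws_ori = window_size
    ws_ext = window_size + int(overlap_ratio * window_size)  # == overlap_win_size
    coords_ori = torch.stack(torch.meshgrid([torch.arange(ws_ori), torch.arange(ws_ori)]))
    coords_ori_flatten = torch.flatten(coords_ori, 1)  # 2, ws*ws
    coords_ext = torch.stack(torch.meshgrid([torch.arange(ws_ext), torch.arange(ws_ext)]))
    coords_ext_flatten = torch.flatten(coords_ext, 1)  # 2, wse*wse
    relative_coords = coords_ext_flatten[:, None, :] - coords_ori_flatten[:, :, None]  # 2, ws*ws, wse*wse
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # ws*ws, wse*wse, 2
    relative_coords[:, :, 0] += ws_ori - 1  # shift to start from 0
    relative_coords[:, :, 1] += ws_ori - 1
    relative_coords[:, :, 0] *= ws_ori + ws_ext - 1
    return relative_coords.sum(-1)  # ws*ws, wse*wse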


class AttenBlocks(nn.Module):
    """A series of HAB blocks followed by one OCAB, forming a single stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of HAB blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        downsample (nn.Module | None): Downsample layer at the end of the stage.
        use_checkpoint (bool): Whether to use checkpointing to save memory.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size, compress_ratio,
                 squeeze_factor, conv_scale, overlap_ratio, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Build HAB blocks; every other block uses shifted windows.
        self.blocks = nn.ModuleList([
            HAB(dim=dim, input_resolution=input_resolution, num_heads=num_heads, window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2, compress_ratio=compress_ratio,
                squeeze_factor=squeeze_factor, conv_scale=conv_scale, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer) for i in range(depth)
        ])

        # Overlapping cross-attention block at the end of the stage.
        self.overlap_attn = OCAB(dim=dim, input_resolution=input_resolution, window_size=window_size,
                                 overlap_ratio=overlap_ratio, num_heads=num_heads, qkv_bias=qkv_bias,
                                 qk_scale=qk_scale, mlp_ratio=mlp_ratio, norm_layer=norm_layer)

        # Optional patch-merging (downsample) layer.
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, x_size, params):
        for blk in self.blocks:
            x = blk(x, x_size, params['rpi_sa'], params['attn_mask'])

        x = self.overlap_attn(x, x_size, params['rpi_oca'])

        if self.downsample is not None:
            x = self.downsample(x)
        return x


class RHAG(nn.Module):
    """Residual Hybrid Attention Group: an AttenBlocks stage wrapped with a
    convolution and a residual connection."""

    def __init__(self, dim, input_resolution, depth, num_heads, window_size, compress_ratio,
                 squeeze_factor, conv_scale, overlap_ratio, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
                 use_checkpoint=False, img_size=224, patch_size=4, resi_connection='1conv'):
        super(RHAG, self).__init__()

        self.dim = dim
        self.input_resolution = input_resolution

        self.residual_group = AttenBlocks(
            dim=dim, input_resolution=input_resolution, depth=depth, num_heads=num_heads,
            window_size=window_size, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor,
            conv_scale=conv_scale, overlap_ratio=overlap_ratio, mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop, attn_drop=attn_drop,
            drop_path=drop_path, norm_layer=norm_layer, downsample=downsample,
            use_checkpoint=use_checkpoint)

        if resi_connection == '1conv':
            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
        elif resi_connection == 'identity':
            self.conv = nn.Identity()
        else:
            raise ValueError(f"resi_connection '{resi_connection}' is not supported.")

        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)

        self.patch_unembed = PatchUnEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim, norm_layer=None)

    def forward(self, x, x_size, params):
        # Unembed to NCHW, convolve, re-embed, then add the residual.
        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size, params), x_size))) + x


class PatchEmbed(nn.Module):
    """Flatten a (b, c, h, w) feature map into patch tokens of shape (b, h*w, c)."""

    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        x = x.flatten(2).transpose(1, 2)  # b, c, h, w -> b, h*w, c
        if self.norm is not None:
            x = self.norm(x)
        return x


class PatchUnEmbed(nn.Module):
    """Reshape patch tokens of shape (b, h*w, c) back to a (b, c, h, w) feature map."""

    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

    def forward(self, x, x_size):
        x = x.transpose(1, 2).contiguous().view(x.shape[0], self.embed_dim, x_size[0], x_size[1])
        return x
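# Round-trip sketch (sizes assumed for illustration): embed and unembed are
# inverses for matching shapes, e.g.
#
#   embed, unembed = PatchEmbed(embed_dim=96), PatchUnEmbed(embed_dim=96)
#   feat = torch.randn(1, 96, 32, 32)
#   tokens = embed(feat)  # -> (1, 1024, 96)
#   assert torch.equal(unembed(tokens, (32, 32)), feat)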


class Upsample(nn.Sequential):
    """Pixel-shuffle upsampling module.

    Args:
        scale (int): Upscale factor. Supported: powers of 2 and 3.
        num_feat (int): Number of intermediate feature channels.
    """

    def __init__(self, scale, num_feat):
        m = []
        if (scale & (scale - 1)) == 0:  # scale is a power of 2
            for _ in range(int(math.log(scale, 2))):
                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
                m.append(nn.PixelShuffle(2))
        elif scale == 3:
            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
            m.append(nn.PixelShuffle(3))
        else:
            raise ValueError(f'scale {scale} is not supported. Supported scales: 2^n and 3.')
        super(Upsample, self).__init__(*m)
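# Smoke-test sketch (shapes assumed for illustration): each 2x stage expands
# channels 4x and pixel-shuffles them into spatial resolution, e.g.
#
#   up = Upsample(scale=4, num_feat=64)
#   y = up(torch.randn(1, 64, 16, 16))  # -> (1, 64, 64, 64)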