import torch
import torch.nn as nn
from rex.utils.iteration import windowed_queue_iter
from transformers import AutoModel, BertModel

from src.utils import decode_nnw_nsw_thw_mat, decode_nnw_thw_mat, decode_pointer_mat
|
|
|
|
class Biaffine(nn.Module):
    """Biaffine transformation

    References:
        - https://github.com/yzhangcs/parser/blob/main/supar/modules/affine.py
        - https://github.com/ljynlp/W2NER
    """

    def __init__(self, n_in, n_out=2, bias_x=True, bias_y=True):
        super().__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        # weight: (n_out, n_in + bias_x, n_in + bias_y)
        weight = torch.zeros(n_out, n_in + int(bias_x), n_in + int(bias_y))
        nn.init.xavier_normal_(weight)
        self.weight = nn.Parameter(weight, requires_grad=True)

    def extra_repr(self):
        s = f"n_in={self.n_in}, n_out={self.n_out}"
        if self.bias_x:
            s += f", bias_x={self.bias_x}"
        if self.bias_y:
            s += f", bias_y={self.bias_y}"
        return s

    def forward(self, x, y):
        if self.bias_x:
            x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
        if self.bias_y:
            y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
        # x, y: (batch_size, seq_len, n_in [+ 1])
        # s: (batch_size, n_out, seq_len, seq_len)
        s = torch.einsum("bxi,oij,byj->boxy", x, self.weight, y)
        return s
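

# Illustrative sketch (documentation only, never called by the models below; the toy
# sizes are assumptions): how shapes flow through Biaffine.
def _biaffine_shape_example():
    biaffine = Biaffine(n_in=16, n_out=2, bias_x=True, bias_y=True)
    x = torch.randn(2, 5, 16)  # (batch_size, seq_len, n_in)
    y = torch.randn(2, 5, 16)
    scores = biaffine(x, y)  # (batch_size, n_out, seq_len, seq_len) == (2, 2, 5, 5)
    return scores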
|
|
|
|
class LinearWithAct(nn.Module):
    """Linear projection followed by GELU activation and dropout."""

    def __init__(self, n_in, n_out, dropout=0) -> None:
        super().__init__()

        self.linear = nn.Linear(n_in, n_out)
        self.act_fn = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.linear(x)
        x = self.act_fn(x)
        x = self.dropout(x)
        return x
|
|
|
|
class PointerMatrix(nn.Module):
    """Pointer Matrix Prediction

    References:
        - https://github.com/ljynlp/W2NER
    """

    def __init__(
        self,
        hidden_size,
        biaffine_size,
        cls_num=2,
        dropout=0,
        biaffine_bias=False,
        use_rope=False,
    ):
        super().__init__()
        # separate head / tail projections feeding the biaffine scorer
        self.linear_h = LinearWithAct(
            n_in=hidden_size, n_out=biaffine_size, dropout=dropout
        )
        self.linear_t = LinearWithAct(
            n_in=hidden_size, n_out=biaffine_size, dropout=dropout
        )
        self.biaffine = Biaffine(
            n_in=biaffine_size,
            n_out=cls_num,
            bias_x=biaffine_bias,
            bias_y=biaffine_bias,
        )
        # whether to apply rotary position embeddings (RoPE) before scoring
        self.use_rope = use_rope
|
|
    def sinusoidal_position_embedding(self, qw, kw):
        batch_size, seq_len, output_dim = qw.shape
        position_ids = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(-1)

        indices = torch.arange(0, output_dim // 2, dtype=torch.float)
        indices = torch.pow(10000, -2 * indices / output_dim)
        pos_emb = position_ids * indices
        # interleaved sin/cos table: (batch_size, seq_len, output_dim)
        pos_emb = torch.stack([torch.sin(pos_emb), torch.cos(pos_emb)], dim=-1)
        pos_emb = pos_emb.repeat((batch_size, *([1] * len(pos_emb.shape))))
        pos_emb = torch.reshape(pos_emb, (batch_size, seq_len, output_dim))
        pos_emb = pos_emb.to(qw)

        # cos/sin terms repeated so each (2i, 2i+1) feature pair shares a frequency
        cos_pos = pos_emb[..., 1::2].repeat_interleave(2, dim=-1)
        sin_pos = pos_emb[..., ::2].repeat_interleave(2, dim=-1)
        # rotate each feature pair (x_{2i}, x_{2i+1}) -> (-x_{2i+1}, x_{2i})
        qw2 = torch.stack([-qw[..., 1::2], qw[..., ::2]], dim=-1).reshape(qw.shape)
        qw = qw * cos_pos + qw2 * sin_pos
        kw2 = torch.stack([-kw[..., 1::2], kw[..., ::2]], dim=-1).reshape(kw.shape)
        kw = kw * cos_pos + kw2 * sin_pos
        return qw, kw
|
|
    def forward(self, x):
        h = self.linear_h(x)
        t = self.linear_t(x)
        if self.use_rope:
            h, t = self.sinusoidal_position_embedding(h, t)
        # o: (batch_size, cls_num, seq_len, seq_len)
        o = self.biaffine(h, t)
        return o
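

# Illustrative sketch (documentation only; the toy sizes are assumptions): PointerMatrix
# turns encoder states into a per-class seq_len x seq_len score grid, optionally mixing
# rotary position embeddings into the head/tail projections.
def _pointer_matrix_shape_example():
    pointer = PointerMatrix(hidden_size=32, biaffine_size=16, cls_num=2, use_rope=True)
    hidden = torch.randn(2, 6, 32)  # (batch_size, seq_len, hidden_size)
    logits = pointer(hidden)  # (batch_size, cls_num, seq_len, seq_len) == (2, 2, 6, 6)
    return logits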
|
|
|
|
def multilabel_categorical_crossentropy(y_pred, y_true, bit_mask=None):
    """Multilabel categorical cross-entropy (the GlobalPointer loss).

    ``y_true`` is a 0/1 matrix and ``y_pred`` holds raw, un-normalized scores; the loss
    pushes scores of positive classes above 0 and scores of negative classes below 0.

    References:
        - https://kexue.fm/archives/7359
        - https://github.com/gaohongkui/GlobalPointer_pytorch/blob/main/common/utils.py
    """
    y_pred = (1 - 2 * y_true) * y_pred
    y_pred_neg = y_pred - y_true * 1e12
    y_pred_pos = y_pred - (1 - y_true) * 1e12
    zeros = torch.zeros_like(y_pred[..., :1])
    y_pred_neg = torch.cat([y_pred_neg, zeros], dim=-1)
    y_pred_pos = torch.cat([y_pred_pos, zeros], dim=-1)
    neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
    pos_loss = torch.logsumexp(y_pred_pos, dim=-1)

    if bit_mask is None:
        return neg_loss + pos_loss
    else:
        raise NotImplementedError
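

# Illustrative sketch (documentation only; the numbers are made up): class 1 is gold but
# scores below 0, and class 3 is not gold but scores above 0, so both add to the loss.
def _multilabel_ce_example():
    y_pred = torch.tensor([[2.0, -1.0, -3.0, 0.5]])  # raw scores for 4 candidate classes
    y_true = torch.tensor([[1.0, 1.0, 0.0, 0.0]])  # classes 0 and 1 are gold
    return multilabel_categorical_crossentropy(y_pred, y_true)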
|
|
|
|
class MrcPointerMatrixModel(nn.Module):
    def __init__(
        self,
        plm_dir: str,
        cls_num: int = 2,
        biaffine_size: int = 384,
        none_type_id: int = 0,
        text_mask_id: int = 4,
        dropout: float = 0.3,
    ):
        super().__init__()

        # number of classes per pointer-matrix cell (link / no-link)
        self.cls_num = cls_num
        # id reserved for the null ("none") type
        self.none_type_id = none_type_id
        # value in `mask` that marks text (context) tokens
        self.text_mask_id = text_mask_id

        self.plm = BertModel.from_pretrained(plm_dir)
        hidden_size = self.plm.config.hidden_size
        # NNW: next-neighboring-word links; THW: tail-head-word links (W2NER-style)
        self.nnw_mat = PointerMatrix(
            hidden_size, biaffine_size, cls_num=2, dropout=dropout
        )
        self.thw_mat = PointerMatrix(
            hidden_size, biaffine_size, cls_num=2, dropout=dropout
        )
        self.criterion = nn.CrossEntropyLoss()
|
|
    def input_encoding(self, input_ids, mask):
        # any positive mask value marks a real (non-padding) token
        attention_mask = mask.gt(0).float()
        plm_outputs = self.plm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        return plm_outputs.last_hidden_state
|
|
    def build_bit_mask(self, mask: torch.Tensor) -> torch.Tensor:
        # mask: (batch_size, seq_len) segment ids; text tokens carry `text_mask_id`
        bs, seq_len = mask.shape
        mask_mat = (
            mask.eq(self.text_mask_id).unsqueeze(-1).expand((bs, seq_len, seq_len))
        )
        # bit_mask: (batch_size, 1, seq_len, seq_len), 1 only for text-text cells
        bit_mask = (
            torch.logical_and(mask_mat, mask_mat.transpose(1, 2)).unsqueeze(1).long()
        )
        return bit_mask
|
|
    def forward(self, input_ids, mask, labels=None, is_eval=False, **kwargs):
        hidden = self.input_encoding(input_ids, mask)
        # nnw/thw logits: (batch_size, 2, seq_len, seq_len)
        nnw_hidden = self.nnw_mat(hidden)
        thw_hidden = self.thw_mat(hidden)
        bs, _, seq_len, _ = nnw_hidden.shape

        # keep only text-text cells when decoding
        bit_mask = self.build_bit_mask(mask)

        results = {"logits": {"nnw": nnw_hidden, "thw": thw_hidden}}
        if labels is not None:
            # labels: (batch_size, 2, seq_len, seq_len), NNW at index 0, THW at index 1
            nnw_loss = self.criterion(
                nnw_hidden.permute(0, 2, 3, 1).reshape(-1, 2),
                labels[:, 0, :, :].reshape(-1),
            )
            thw_loss = self.criterion(
                thw_hidden.permute(0, 2, 3, 1).reshape(-1, 2),
                labels[:, 1, :, :].reshape(-1),
            )
            loss = nnw_loss + thw_loss
            results["loss"] = loss

        if is_eval:
            batch_positions = self.decode(nnw_hidden, thw_hidden, bit_mask, **kwargs)
            results["pred"] = batch_positions
        return results
|
|
    def decode(
        self,
        nnw_hidden: torch.Tensor,
        thw_hidden: torch.Tensor,
        bit_mask: torch.Tensor,
        **kwargs,
    ):
        # per-cell class predictions: (batch_size, seq_len, seq_len)
        nnw_pred = nnw_hidden.argmax(1)
        thw_pred = thw_hidden.argmax(1)
        # stack into (batch_size, 2, seq_len, seq_len) and drop non-text cells
        pred = torch.stack([nnw_pred, thw_pred], dim=1)
        pred = pred * bit_mask

        batch_preds = decode_nnw_thw_mat(pred, offsets=kwargs.get("offset"))

        return batch_preds
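

# Illustrative usage sketch (documentation only). The checkpoint name and the all-text
# `mask` are assumptions; in the real pipeline `mask` mixes several segment ids and only
# positions equal to `text_mask_id` count as text.
def _mrc_pointer_matrix_usage_example():
    model = MrcPointerMatrixModel(plm_dir="bert-base-uncased", text_mask_id=4)
    input_ids = torch.randint(0, model.plm.config.vocab_size, (2, 16))
    mask = torch.full((2, 16), 4)  # pretend every position is a text token
    outputs = model(input_ids, mask, labels=None, is_eval=False)
    # outputs["logits"]["nnw"] and ["thw"]: (2, 2, 16, 16) pointer matrices
    return outputs["logits"]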
|
|
|
|
class MrcGlobalPointerModel(nn.Module):
    def __init__(
        self,
        plm_dir: str,
        use_rope: bool = True,
        cls_num: int = 2,
        biaffine_size: int = 384,
        none_type_id: int = 0,
        text_mask_id: int = 4,
        dropout: float = 0.3,
        mode: str = "w2",
    ):
        super().__init__()

        # number of output classes
        self.cls_num = cls_num
        # id reserved for the null ("none") type
        self.none_type_id = none_type_id
        # value in `mask` that marks text (context) tokens
        self.text_mask_id = text_mask_id
        self.use_rope = use_rope

        # "w2": NNW + THW word-linking matrices (W2NER-style)
        # "cons": a single upper-triangular (start, end) span matrix
        self.mode = mode
        assert self.mode in ["w2", "cons"]

        self.plm = BertModel.from_pretrained(plm_dir)
        self.hidden_size = self.plm.config.hidden_size
        self.biaffine_size = biaffine_size
        self.pointer = PointerMatrix(
            self.hidden_size,
            biaffine_size,
            cls_num=2 if self.mode == "w2" else 1,
            dropout=dropout,
            biaffine_bias=True,
            use_rope=use_rope,
        )
|
|
    def input_encoding(self, input_ids, mask):
        # any positive mask value marks a real (non-padding) token
        attention_mask = mask.gt(0).float()
        plm_outputs = self.plm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        return plm_outputs.last_hidden_state
|
|
    def build_bit_mask(self, mask: torch.Tensor) -> torch.Tensor:
        # mask: (batch_size, seq_len) segment ids; text tokens carry `text_mask_id`
        bs, seq_len = mask.shape
        mask_mat = (
            mask.eq(self.text_mask_id).unsqueeze(-1).expand((bs, seq_len, seq_len))
        )
        # bit_mask: (batch_size, 1, seq_len, seq_len), 1.0 only for text-text cells
        bit_mask = (
            torch.logical_and(mask_mat, mask_mat.transpose(1, 2)).unsqueeze(1).float()
        )
        if self.mode == "cons":
            # span mode only scores cells with start <= end
            bit_mask = bit_mask.triu()

        return bit_mask
|
|
    def forward(
        self, input_ids, mask, labels=None, is_eval=False, top_p=0.5, top_k=-1, **kwargs
    ):
        bit_mask = self.build_bit_mask(mask)
        hidden = self.input_encoding(input_ids, mask)
        # logits: (batch_size, cls_num, seq_len, seq_len)
        logits = self.pointer(hidden)
        # suppress non-text cells and rescale as in GlobalPointer
        logits = logits * bit_mask - (1.0 - bit_mask) * 1e12
        logits = logits / (self.biaffine_size**0.5)
        bs, cls_num, seq_len, _ = logits.shape

        results = {"logits": logits}
        if labels is not None:
            assert labels.shape == (bs, cls_num, seq_len, seq_len)
            loss = multilabel_categorical_crossentropy(
                logits.reshape(bs * cls_num, -1), labels.reshape(bs * cls_num, -1)
            )
            loss = loss.mean()
            results["loss"] = loss

        if is_eval:
            batch_positions = self.decode(logits, top_p=top_p, top_k=top_k, **kwargs)
            results["pred"] = batch_positions
        return results
|
|
    def calc_path_prob(self, probs, paths):
        """
        Args:
            probs: (2, seq_len, seq_len) | (1, seq_len, seq_len)
            paths: a list of paths in tuple

        Returns:
            [(path: tuple, prob: float), ...]
        """
        assert self.mode in ["w2", "cons"]
        paths_with_prob = []
        for path in paths:
            path_prob = 1.0
            if self.mode == "w2":
                # chain probability: consecutive NNW links plus the closing THW link
                for se in windowed_queue_iter(path, 2, 1, drop_last=True):
                    path_prob *= probs[0, se[0], se[-1]]
                path_prob *= probs[1, path[-1], path[0]]
            elif self.mode == "cons":
                # span probability: a single (start, end) cell
                path_prob = probs[0, path[0], path[-1]]
            paths_with_prob.append((path, path_prob))
        return paths_with_prob
|
|
    def decode(
        self,
        logits: torch.Tensor,
        top_p: float = 0.5,
        top_k: int = -1,
        **kwargs,
    ):
        assert self.mode in ["w2", "cons"]
        # probs/pred: (batch_size, cls_num, seq_len, seq_len)
        probs = logits.sigmoid()
        pred = (probs > top_p).long()
        if self.mode == "w2":
            preds = decode_nnw_thw_mat(pred, offsets=kwargs.get("offset"))
        elif self.mode == "cons":
            pred = pred.triu()
            preds = decode_pointer_mat(pred, offsets=kwargs.get("offset"))

        if top_k == -1:
            batch_preds = preds
        else:
            # keep only the top_k most probable paths per instance
            batch_preds = []
            for i, paths in enumerate(preds):
                paths_with_prob = self.calc_path_prob(probs[i], paths)
                paths_with_prob.sort(key=lambda pp: pp[1], reverse=True)
                batch_preds.append([pp[0] for pp in paths_with_prob[:top_k]])

        return batch_preds
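

# Illustrative training-step sketch (documentation only). The checkpoint name, the
# all-text `mask`, and the empty gold matrices are assumptions.
def _mrc_global_pointer_usage_example():
    model = MrcGlobalPointerModel(plm_dir="bert-base-uncased", mode="w2", text_mask_id=4)
    input_ids = torch.randint(0, model.plm.config.vocab_size, (2, 12))
    mask = torch.full((2, 12), 4)  # pretend every position is a text token
    labels = torch.zeros(2, 2, 12, 12)  # gold NNW / THW matrices (all empty here)
    outputs = model(input_ids, mask, labels=labels, is_eval=False)
    outputs["loss"].backward()
    return outputs["loss"]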
|
|
|
|
class SchemaGuidedInstructBertModel(nn.Module):
    def __init__(
        self,
        plm_dir: str,
        vocab_size: int = None,
        use_rope: bool = True,
        biaffine_size: int = 512,
        label_mask_id: int = 4,
        text_mask_id: int = 7,
        dropout: float = 0.3,
    ):
        super().__init__()

        # values in `mask` that mark label/schema tokens and text tokens
        self.label_mask_id = label_mask_id
        self.text_mask_id = text_mask_id
        self.use_rope = use_rope

        self.plm = AutoModel.from_pretrained(plm_dir)
        if vocab_size:
            # resize the embedding table when extra special tokens were added
            self.plm.resize_token_embeddings(vocab_size)
        self.hidden_size = self.plm.config.hidden_size
        self.biaffine_size = biaffine_size
        # three pointer matrices, one per link type consumed by decode_nnw_nsw_thw_mat
        self.pointer = PointerMatrix(
            self.hidden_size,
            biaffine_size,
            cls_num=3,
            dropout=dropout,
            biaffine_bias=True,
            use_rope=use_rope,
        )
|
|
    def input_encoding(self, input_ids, mask):
        # any positive mask value marks a real (non-padding) token
        attention_mask = mask.gt(0).float()
        plm_outputs = self.plm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        return plm_outputs.last_hidden_state
|
|
    def build_bit_mask(self, mask: torch.Tensor) -> torch.Tensor:
        # mask: (batch_size, seq_len) segment ids, 0 for padding
        bs, seq_len = mask.shape
        # keep every cell whose target (column) position is non-padding, so links may
        # touch label/schema tokens as well as text tokens
        # bit_mask: (batch_size, 1, seq_len, seq_len)
        bit_mask = (
            mask.gt(0).unsqueeze(1).unsqueeze(1).expand(bs, 1, seq_len, seq_len).float()
        )

        return bit_mask
|
|
    def forward(
        self, input_ids, mask, labels=None, is_eval=False, top_p=0.5, top_k=-1, **kwargs
    ):
        bit_mask = self.build_bit_mask(mask)
        hidden = self.input_encoding(input_ids, mask)
        # logits: (batch_size, cls_num, seq_len, seq_len)
        logits = self.pointer(hidden)
        # suppress padding cells and rescale as in GlobalPointer
        logits = logits * bit_mask - (1.0 - bit_mask) * 1e12
        logits = logits / (self.biaffine_size**0.5)
        bs, cls_num, seq_len, _ = logits.shape

        results = {"logits": logits}
        if labels is not None:
            assert labels.shape == (bs, cls_num, seq_len, seq_len)
            loss = multilabel_categorical_crossentropy(
                logits.reshape(bs * cls_num, -1), labels.reshape(bs * cls_num, -1)
            )
            loss = loss.mean()
            results["loss"] = loss

        if is_eval:
            batch_positions = self.decode(logits, top_p=top_p, top_k=top_k, **kwargs)
            results["pred"] = batch_positions
        return results
|
|
    def calc_path_prob(self, probs, paths):
        """
        Args:
            probs: (cls_num, seq_len, seq_len) scores for one instance
            paths: a list of paths in tuple

        Returns:
            [(path: tuple, prob: float), ...]
        """
        paths_with_prob = []
        for path in paths:
            path_prob = 1.0
            # multiply link probabilities between consecutive path positions, then the
            # closing link from the path tail back to its head
            for se in windowed_queue_iter(path, 2, 1, drop_last=True):
                path_prob *= probs[0, se[0], se[-1]]
            path_prob *= probs[1, path[-1], path[0]]
            paths_with_prob.append((path, path_prob))
        return paths_with_prob
|
|
    def decode(
        self,
        logits: torch.Tensor,
        top_p: float = 0.5,
        top_k: int = -1,
        legal_num_parts: tuple = None,
        labels: torch.Tensor = None,
        **kwargs,
    ):
        # probs/pred: (batch_size, cls_num, seq_len, seq_len)
        probs = logits.sigmoid()
        if labels is None:
            pred = (probs > top_p).long()
        else:
            # teacher forcing: decode directly from the gold matrices
            pred = labels
        # preds: for each instance, a list of index paths (tuples of token positions)
        preds = decode_nnw_nsw_thw_mat(pred, offsets=kwargs.get("offset"))

        if top_k == -1:
            batch_preds = preds
        else:
            # keep only the top_k most probable paths per instance
            batch_preds = []
            for i, paths in enumerate(preds):
                paths_with_prob = self.calc_path_prob(probs[i], paths)
                paths_with_prob.sort(key=lambda pp: pp[1], reverse=True)
                batch_preds.append([pp[0] for pp in paths_with_prob[:top_k]])

        # optionally drop paths whose number of parts is not allowed by the task schema
        if legal_num_parts is not None:
            legal_preds = []
            for ins_paths in batch_preds:
                legal_paths = []
                for path in ins_paths:
                    if len(path) in legal_num_parts:
                        legal_paths.append(path)
                legal_preds.append(legal_paths)
        else:
            legal_preds = batch_preds

        return legal_preds
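

# Illustrative sketch (documentation only; the checkpoint name, mask layout, and empty
# gold matrices are assumptions). Compared with the MRC models above, this head predicts
# three link matrices at once and keeps every non-padding position in the bit mask.
def _schema_guided_instruct_bert_usage_example():
    model = SchemaGuidedInstructBertModel(plm_dir="bert-base-uncased")
    input_ids = torch.randint(0, model.plm.config.vocab_size, (2, 10))
    mask = torch.full((2, 10), 7)  # pretend every position is a text token (id 7)
    labels = torch.zeros(2, 3, 10, 10)  # one gold matrix per link type
    outputs = model(input_ids, mask, labels=labels, is_eval=False)
    return outputs["loss"]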
|
|