from collections.abc import Generator, Iterable
from enum import StrEnum

from nltk.tokenize import TreebankWordDetokenizer

import torch
import torch.nn as nn
from transformers import (
    AutoConfig,
    AutoModel,
    DebertaV2Model,
    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
)
from transformers.modeling_outputs import TokenClassifierOutput

class ModelURI(StrEnum):
    BASE = "microsoft/deberta-v3-base"
    LARGE = "microsoft/deberta-v3-large"

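# ModelURI members are plain strings (StrEnum), so they can be passed straight
# to the transformers loaders, e.g. AutoConfig.from_pretrained(ModelURI.BASE).

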
class ConSec(PreTrainedModel):
    def __init__(self, config: PretrainedConfig):
        super().__init__(config)
        if config.init_basemodel:
            self.BaseModel = AutoModel.from_pretrained(
                config.name_or_path,
                device_map="auto",
                dtype=torch.bfloat16,
            )
            # Make room in the embedding matrix for the tokens added to the
            # tokenizer.
            self.config.vocab_size += 2
            self.BaseModel.resize_token_embeddings(self.config.vocab_size)
        else:
            self.BaseModel = DebertaV2Model(config)
        config.init_basemodel = False
        self.loss = nn.CrossEntropyLoss()
        self.post_init()

    @classmethod
    def from_base(cls, base_id: ModelURI) -> "ConSec":
        config = AutoConfig.from_pretrained(base_id)
        config.init_basemodel = True
        return cls(config)

    def add_special_tokens(self, start: int, end: int, gloss: int) -> None:
        # Record the ids of the [START], [END] and [GLOSS] markers so that
        # forward() can locate them in the input.
        self.config.start_token = start
        self.config.end_token = end
        self.config.gloss_token = gloss

    def forward(self,
                input_ids: torch.Tensor | None = None,
                attention_mask: torch.Tensor | None = None,
                token_type_ids: torch.Tensor | None = None,
                position_ids: torch.Tensor | None = None,
                inputs_embeds: torch.Tensor | None = None,
                labels: torch.Tensor | None = None,
                output_attentions: bool | None = None,
                output_hidden_states: bool | None = None,
                return_dict: bool | None = None,
                **kwargs) -> TokenClassifierOutput:
        base_model_output = self.BaseModel(input_ids=input_ids,
                                           attention_mask=attention_mask,
                                           token_type_ids=token_type_ids,
                                           position_ids=position_ids,
                                           inputs_embeds=inputs_embeds,
                                           output_attentions=output_attentions,
                                           output_hidden_states=output_hidden_states,
                                           **kwargs)
        token_vectors = base_model_output.last_hidden_state
        # Build a 0/1 mask over the tokens between each [START]/[END] pair
        # (inclusive), then sum the masked token vectors into a single entity
        # vector per sequence.
        selection = torch.zeros_like(input_ids, dtype=token_vectors.dtype)
        starts = (input_ids == self.config.start_token).nonzero()
        ends = (input_ids == self.config.end_token).nonzero()
        for startpos, endpos in zip(starts, ends, strict=True):
            selection[startpos[0], startpos[1] : endpos[1] + 1] = 1.0
        entity_vectors = torch.einsum("ijk,ij->ik", token_vectors, selection)
        gloss_vectors = self.gloss_vectors(
            input_ids, starts, position_ids, token_vectors
        )
        # One logit per candidate gloss: the dot product between the entity
        # vector and the corresponding [GLOSS] token vector.
        logits = torch.einsum("ij,ikj->ik", entity_vectors, gloss_vectors)

        return TokenClassifierOutput(
            logits=logits,
            loss=self.loss(logits, labels) if labels is not None else None,
            hidden_states=base_model_output.hidden_states if output_hidden_states else None,
            attentions=base_model_output.attentions if output_attentions else None,
        )

    def gloss_vectors(self,
                      input_ids: torch.Tensor,
                      starts: torch.Tensor,
                      position_ids: torch.Tensor,
                      token_vectors: torch.Tensor) -> torch.Tensor:
        with self.device:
            # For each sequence, collect the vectors of the [GLOSS] tokens
            # that share a position id with that sequence's [START] token;
            # the tagger assigns candidate glosses exactly that position.
            vectors = [token_vectors[i, ((position_ids[i] == position_ids[i, j])
                                         & (input_ids[i] == self.config.gloss_token))]
                       for (i, j) in starts]
            # Zero-pad to the largest candidate count so the batch stacks.
            maxlen = max(vector.shape[0] for vector in vectors)
            return torch.stack([torch.cat([vector,
                                           torch.zeros((maxlen - vector.shape[0],
                                                        vector.shape[1]),
                                                       dtype=vector.dtype)])
                                for vector in vectors])

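# A minimal wiring sketch (illustrative, not from this module): `tokenizer` is
# assumed to be a DeBERTa-v3 tokenizer that has already had the markers added,
# e.g. tokenizer.add_tokens(["[START]", "[END]", "[GLOSS]"]).
#
#     model = ConSec.from_base(ModelURI.BASE)
#     added = tokenizer.get_added_vocab()
#     model.add_special_tokens(added["[START]"],
#                              added["[END]"],
#                              added["[GLOSS]"])
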
def json_sequencer(sentence: list[dict]) -> Generator[tuple[list[str], list[str], int], None, None]:
    """Yield one disambiguation instance per annotatable chunk, easiest
    (fewest candidates) first, as (words, candidates, chunk index)."""
    sites = [{"span": i, "n_candidates": len(chunk["candidates"])}
             for (i, chunk) in enumerate(sentence)
             if "candidates" in chunk]
    for site in sorted(sites, key=lambda x: x["n_candidates"]):
        # Reassemble the sentence with the target chunk wrapped in
        # [START] ... [END] markers.
        words = [word for chunk in sentence[:site["span"]]
                 for word in chunk["words"]]
        words.append("[START]")
        words.extend(sentence[site["span"]]["words"])
        words.append("[END]")
        words.extend([word for chunk in sentence[site["span"] + 1:]
                      for word in chunk["words"]])
        yield (words,
               sentence[site["span"]]["candidates"],
               site["span"])

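# json_sequencer expects a pre-chunked sentence along these lines (a hedged
# illustration; "words" and "candidates" are the only keys it reads, and the
# candidate identifiers are whatever the ontology uses as `concept` ids):
#
#     sentence = [
#         {"words": ["The"]},
#         {"words": ["bank"], "candidates": ["bank.n.01", "bank.n.06"]},
#         {"words": ["closed", "."]},
#     ]
#
# For the "bank" chunk it would yield
#     (["The", "[START]", "bank", "[END]", "closed", "."],
#      ["bank.n.01", "bank.n.06"],
#      1)
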
def json_labeller(sentence, tags):
    """Write each predicted label back onto its chunk of the sentence."""
    for tag in tags:
        sentence[tag["index"]]["label"] = tag["label"]
    return sentence

class ConSecTagger:
    def __init__(self,
                 model: ConSec,
                 tokenizer: PreTrainedTokenizer,
                 ontology: Iterable,
                 sequencer=json_sequencer,
                 labeller=json_labeller):
        self.model = model
        self.tokenizer = tokenizer
        special_tokens = self.tokenizer.get_added_vocab()
        self.start_token = special_tokens["[START]"]
        self.gloss_token = special_tokens["[GLOSS]"]
        self.sequencer = sequencer
        self.detokenizer = TreebankWordDetokenizer()
        # The ontology is any iterable of synset-like objects exposing
        # `concept` (an identifier) and `definition` (its gloss).
        self.glosses = {synset.concept: synset.definition
                        for synset in ontology}
        self.label = labeller

    def __call__(self, sentence):
        already_tagged = []
        for (words, candidates, index) in self.sequencer(sentence):
            text = self.detokenizer.detokenize(words)
            # The leading empty string makes join() prefix every gloss,
            # including the first, with a [GLOSS] marker.
            glosses = ['']
            glosses.extend([self.glosses[candidate] for candidate in candidates])
            glosses.extend([self.glosses[previous["label"]] for previous in already_tagged])
            with self.model.device:
                tokens = self.tokenizer(text, "[GLOSS] ".join(glosses),
                                        return_tensors="pt")
                length = tokens.input_ids.shape[1]
                positions = torch.arange(length)
                place = (tokens.input_ids == self.start_token).nonzero(as_tuple=True)[1].item()
                wordpos = tokens.token_to_word(place)
                gloss_positions = [pos.item()
                                   for pos in (tokens.input_ids == self.gloss_token).nonzero(as_tuple=True)[1]]
                gloss_positions.append(length)
                n_candidates = len(candidates)
                # Remap position ids so each candidate gloss "sits at" the
                # [START] marker, while the gloss of an already-tagged word
                # sits at the word it disambiguated; gloss_vectors() uses this
                # to tell candidates apart from context glosses.
                for (i, position) in enumerate(gloss_positions[:-1]):
                    if i < n_candidates:
                        end = place + gloss_positions[i + 1] - position
                        positions[position:gloss_positions[i + 1]] = torch.arange(place, end)
                    else:
                        known = already_tagged[i - n_candidates]
                        start = tokens.word_to_tokens(known["place"]).start
                        end = start + gloss_positions[i + 1] - position
                        positions[position:gloss_positions[i + 1]] = torch.arange(start, end)
                prediction = self.model(input_ids=tokens.input_ids,
                                        attention_mask=tokens.attention_mask,
                                        token_type_ids=tokens.token_type_ids,
                                        position_ids=positions.reshape((1, length)))
            try:
                label = candidates[prediction.logits.argmax()]
            except IndexError:
                # Dump the failing instance before re-raising.
                print(text)
                print(gloss_positions)
                print([positions[pos].item() for pos in gloss_positions[:-1]])
                print(already_tagged)
                print(candidates)
                print(prediction.logits)
                print(prediction.logits.argmax())
                raise
            already_tagged.append({"label": label,
                                   "place": wordpos,
                                   "index": index})
        return self.label(sentence, already_tagged)
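

# End-to-end sketch (illustrative assumptions: AutoTokenizer comes from
# transformers, and `ontology` is any iterable of objects with `concept` and
# `definition` attributes; neither is defined in this module):
#
#     tokenizer = AutoTokenizer.from_pretrained(ModelURI.BASE)
#     tokenizer.add_tokens(["[START]", "[END]", "[GLOSS]"])
#     model = ConSec.from_base(ModelURI.BASE)
#     added = tokenizer.get_added_vocab()
#     model.add_special_tokens(added["[START]"], added["[END]"], added["[GLOSS]"])
#     tagger = ConSecTagger(model, tokenizer, ontology)
#     tagged = tagger(sentence)   # `sentence` as in the json_sequencer example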