varocarras
/

Humaneyes

Text Generation

text2text-generation

Model card Files Files and versions

Humaneyes / handler.py

varocarras's picture

Upload handler.py with huggingface_hub

acff0a0 verified 5 months ago

history blame contribute delete

4.23 kB

	from typing import Dict, List, Any
	import torch
	from transformers import PegasusForConditionalGeneration, PegasusTokenizer
	import re

	class EndpointHandler:
	def __init__(self, path=""):
	"""
	Initialize the endpoint handler with the model and tokenizer.

	:param path: Path to the model weights
	"""
	# Determine the device
	self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# Load tokenizer and model
	self.tokenizer = PegasusTokenizer.from_pretrained(path)
	self.model = PegasusForConditionalGeneration.from_pretrained(path).to(self.torch_device)

	def split_into_paragraphs(self, text: str) -> List[str]:
	"""
	Split text into paragraphs while preserving empty lines.

	:param text: Input text
	:return: List of paragraphs
	"""
	paragraphs = text.split('\n\n')
	return [p.strip() for p in paragraphs if p.strip()]

	def split_into_sentences(self, paragraph: str) -> List[str]:
	"""
	Split paragraph into sentences using regex.

	:param paragraph: Input paragraph
	:return: List of sentences
	"""
	sentences = re.split(r'(?<=[.!?])\s+', paragraph)
	return [s.strip() for s in sentences if s.strip()]

	def get_response(self, input_text: str, num_return_sequences: int = 1) -> str:
	"""
	Generate paraphrased text for a single input.

	:param input_text: Input sentence to paraphrase
	:param num_return_sequences: Number of alternative paraphrases to generate
	:return: Paraphrased text
	"""
	batch = self.tokenizer.prepare_seq2seq_batch(
	[input_text],
	truncation=True,
	padding='longest',
	max_length=80,
	return_tensors="pt"
	).to(self.torch_device)

	translated = self.model.generate(
	**batch,
	num_beams=10,
	num_return_sequences=num_return_sequences,
	temperature=1.0,
	repetition_penalty=2.8,
	length_penalty=1.2,
	max_length=80,
	min_length=5,
	no_repeat_ngram_size=3
	)

	tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
	return tgt_text[0]

	def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
	"""
	Process the incoming request and generate paraphrased text.

	:param data: Request payload containing input text
	:return: Paraphrased text
	"""
	# Extract input text from the payload
	inputs = data.pop("inputs", data)

	# If input is not a string, raise an error
	if not isinstance(inputs, str):
	raise ValueError("Input must be a string")

	# Split text into paragraphs
	paragraphs = self.split_into_paragraphs(inputs)
	paraphrased_paragraphs = []

	# Process each paragraph
	for paragraph in paragraphs:
	sentences = self.split_into_sentences(paragraph)
	paraphrased_sentences = []

	for sentence in sentences:
	# Skip very short sentences
	if len(sentence.split()) < 3:
	paraphrased_sentences.append(sentence)
	continue

	try:
	# Paraphrase the sentence
	paraphrased = self.get_response(sentence)

	# Avoid unwanted paraphrases
	if not any(phrase in paraphrased.lower() for phrase in ['it\'s like', 'in other words']):
	paraphrased_sentences.append(paraphrased)
	else:
	paraphrased_sentences.append(sentence)
	except Exception as e:
	print(f"Error processing sentence: {e}")
	paraphrased_sentences.append(sentence)

	# Join sentences back into a paragraph
	paraphrased_paragraphs.append(' '.join(paraphrased_sentences))

	# Join paragraphs back into text
	return {"outputs": '\n\n'.join(paraphrased_paragraphs)}