import base64
import io
import os
from typing import Any, Dict, List

import numpy as np
import torch
from transformers import BarkModel, BarkProcessor
|
class EndpointHandler:
    """Serverless inference handler that converts text to speech with Bark.

    Lifecycle: construction is lightweight; the first request triggers
    ``setup()``, which loads the model and processor. ``__call__`` runs
    preprocess -> inference -> postprocess and returns a JSON-safe dict
    containing base64-encoded WAV audio, or ``{"error": ...}`` on failure.
    """

    def __init__(self, path=""):
        """Record the model path and pick a device; defer heavy loading.

        Args:
            path (str, optional): Model directory or hub id passed to
                ``from_pretrained``. Defaults to "".
        """
        self.path = path
        self.model = None
        self.processor = None
        # Prefer GPU when available; Bark generation is very slow on CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.initialized = False

    def setup(self, **kwargs):
        """Load the Bark model and processor onto ``self.device``.

        Args:
            **kwargs: Unused; accepted for handler-framework compatibility.
        """
        self.model = BarkModel.from_pretrained(self.path)
        self.model.to(self.device)
        self.processor = BarkProcessor.from_pretrained(self.path)
        self.initialized = True
        print(f"Bark model loaded on {self.device}")

    def preprocess(self, request: Dict) -> Dict:
        """Extract text and generation options from the raw request.

        Args:
            request (Dict): Payload with ``"inputs"`` (a string, or a list of
                strings of which only the first is used) and an optional
                ``"parameters"`` dict (``speaker_id``/``voice_preset``,
                ``temperature``).

        Returns:
            Dict: Normalized inputs for :meth:`inference`.
        """
        if not self.initialized:
            self.setup()

        inputs: Dict[str, Any] = {}

        raw = request.get("inputs")
        if isinstance(raw, str):
            inputs["text"] = raw
        elif isinstance(raw, list) and raw:
            # Bark generates one clip at a time; batch requests are truncated
            # to their first element. (Guarding non-empty also fixes the
            # IndexError the original raised on an empty list.)
            inputs["text"] = raw[0]

        params = request.get("parameters", {})

        # "speaker_id" and "voice_preset" are both treated as a Bark voice
        # preset name (e.g. "v2/en_speaker_6"); speaker_id wins if both exist.
        if "speaker_id" in params:
            inputs["speaker_id"] = params["speaker_id"]
        elif "voice_preset" in params:
            inputs["voice_preset"] = params["voice_preset"]

        if "temperature" in params:
            # Fix: the original re-read the key with a redundant
            # ``params.get("temperature", 0.7)`` although the key is known
            # to exist inside this branch.
            inputs["temperature"] = params["temperature"]

        return inputs

    def inference(self, inputs: Dict) -> Dict:
        """Generate speech audio for the prepared inputs.

        Args:
            inputs (Dict): Output of :meth:`preprocess`.

        Returns:
            Dict: ``{"audio_array": np.ndarray, "sample_rate": int}`` on
            success, or ``{"error": str}`` when no text was supplied.
        """
        text = inputs.get("text", "")
        if not text:
            return {"error": "No text provided for speech generation"}

        # Either key names a Bark voice preset such as "v2/en_speaker_6".
        voice_preset = inputs.get("speaker_id") or inputs.get("voice_preset")
        temperature = inputs.get("temperature", 0.7)

        # Fix: the voice preset must be given to the *processor* (it is
        # resolved into a history_prompt), not to ``model.generate``; and the
        # processor returns a BatchFeature that has to be unpacked with **
        # rather than passed wholesale as ``input_ids``.
        if voice_preset:
            model_inputs = self.processor(text, voice_preset=voice_preset)
        else:
            model_inputs = self.processor(text)
        model_inputs = model_inputs.to(self.device)

        with torch.no_grad():
            # NOTE(review): un-prefixed generation kwargs such as
            # ``temperature`` are forwarded to all Bark sub-models — confirm
            # against the installed transformers version.
            speech_output = self.model.generate(
                **model_inputs,
                temperature=temperature,
            )

        # (1, n_samples) tensor on the device -> flat float waveform on CPU.
        audio_array = speech_output.cpu().numpy().squeeze()

        return {
            "audio_array": audio_array,
            "sample_rate": self.model.generation_config.sample_rate,
        }

    def postprocess(self, inference_output: Dict) -> Dict:
        """Encode the generated waveform as a base64 WAV payload.

        Args:
            inference_output (Dict): Output of :meth:`inference`.

        Returns:
            Dict: ``{"audio": <base64 wav str>, "sample_rate": int,
            "format": "wav"}`` on success, or ``{"error": str}``.
        """
        if "error" in inference_output:
            return {"error": inference_output["error"]}

        audio_array = inference_output.get("audio_array")
        sample_rate = inference_output.get("sample_rate", 24000)
        if audio_array is None:
            # Robustness fix: the original would crash wav.write with None.
            return {"error": "Error converting audio: no audio data"}

        try:
            # Local import keeps scipy optional until audio is produced.
            import scipy.io.wavfile as wav

            # Robustness fix: Bark emits float32 in [-1, 1]; some players
            # reject IEEE-float WAV files, so clip and convert to 16-bit PCM.
            if np.issubdtype(np.asarray(audio_array).dtype, np.floating):
                audio_array = (
                    np.clip(audio_array, -1.0, 1.0) * 32767
                ).astype(np.int16)

            audio_buffer = io.BytesIO()
            wav.write(audio_buffer, sample_rate, audio_array)
            audio_data = audio_buffer.getvalue()

            audio_base64 = base64.b64encode(audio_data).decode("utf-8")

            return {
                "audio": audio_base64,
                "sample_rate": sample_rate,
                "format": "wav",
            }
        except Exception as e:
            # Best-effort boundary: surface the failure to the caller rather
            # than crashing the endpoint (also covers a missing scipy).
            return {"error": f"Error converting audio: {str(e)}"}

    def __call__(self, data: Dict) -> Dict:
        """Handle one request end to end.

        Args:
            data (Dict): Raw request payload.

        Returns:
            Dict: Base64 WAV response, or ``{"error": str}``.
        """
        if not self.initialized:
            self.setup()

        try:
            inputs = self.preprocess(data)
            outputs = self.inference(inputs)
            return self.postprocess(outputs)
        except Exception as e:
            # Top-level boundary: any unexpected failure becomes a
            # structured error response instead of a 500 with a traceback.
            return {"error": f"Error processing request: {str(e)}"}
| |
|