vinyovy's picture
Update app.py
b2da9b8 verified
import os
import torch
import librosa
import gradio as gr
from huggingface_hub import login
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Checkpoint served by this app. It is a gated model, hence the optional
# login step below.
MODEL_ID = "sulaimank/xlsr-luganda-400hr-all"

# Authenticate with the Hub only when a token was supplied through the
# environment; without one the login call is skipped entirely.
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    login(token=hf_token)

print("Loading model... this may take a minute on first run.")
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
# eval() returns the module itself, so chaining leaves `model` bound to the
# loaded network, already switched to inference mode.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).eval()
def _to_mono_unit_float(data):
    """Downmix *data* to a single channel and scale it to float32 near [-1, 1].

    Args:
        data: Non-empty NumPy array of samples. Arrays with more than one
            dimension are averaged over axis 1.

    Returns:
        A float32 array. Signed-integer input is divided by the full-scale
        value of its own sample width; any other input is divided by 32768
        only when its absolute peak exceeds 1.0.
    """
    # Remember the dtype now: mean() below promotes integers to float64 and
    # would hide the original sample width.
    source_dtype = data.dtype
    if data.ndim > 1:
        data = data.mean(axis=1)
    data = data.astype("float32")

    if source_dtype.kind == "i":
        # 32768 for int16, 2**31 for int32, ... A fixed 32768 divisor would
        # leave wider samples far outside [-1, 1].
        return data / float(2 ** (8 * source_dtype.itemsize - 1))

    # Legacy heuristic kept for non-signed-integer input: values beyond unit
    # range are treated as 16-bit-scaled. The absolute peak is used so that
    # large negative excursions are detected as well.
    if abs(data).max() > 1.0:
        return data / 32768.0
    return data


def transcribe(audio):
    """Transcribe an audio clip to lowercase text with the loaded CTC model.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple, where
            ``samples`` is a NumPy array (multi-dimensional input is averaged
            over axis 1).

    Returns:
        The lowercased transcription obtained by taking the arg-max token at
        every frame and decoding it, or a prompt message when no audio (or a
        zero-length clip) was supplied.
    """
    if audio is None:
        return "Please upload or record an audio file."

    sr, data = audio
    if data.size == 0:
        # Nothing to decode; the peak reduction used for normalisation would
        # also raise on an empty array.
        return "Please upload or record an audio file."

    data = _to_mono_unit_float(data)
    if sr != 16000:
        # The processor is called with sampling_rate=16000 below, so bring
        # the waveform to that rate first.
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)

    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0].lower()
# Build the UI: a heading, then one row with the audio input and button on
# the left and the transcription box on the right.
demo = gr.Blocks(title="Luganda Speech Transcriber", theme=gr.themes.Soft())
with demo:
    gr.Markdown(
        "# Luganda Speech Transcriber\n"
        "Transcribe spoken **Luganda** audio to text."
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Audio Input",
                sources=["upload", "microphone"],
                type="numpy",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription",
                placeholder="Your transcription will appear here...",
                lines=8,
            )

    # Both the button click and any change of the audio value run the same
    # handler and write to the same textbox.
    for _event in (transcribe_btn.click, audio_input.change):
        _event(fn=transcribe, inputs=audio_input, outputs=output_text)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()