"""Gradio app that transcribes spoken Luganda with a Wav2Vec2 CTC model."""

import os

import torch
import librosa
import gradio as gr
from huggingface_hub import login
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Login using the secret token for gated model access
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    login(token=hf_token)

MODEL_ID = "sulaimank/xlsr-luganda-400hr-all"

# Sample rate passed to the processor; incoming audio is resampled to it.
TARGET_SAMPLE_RATE = 16000

# Shown whenever there is nothing to transcribe.
NO_AUDIO_MESSAGE = "Please upload or record an audio file."

print("Loading model... this may take a minute on first run.")
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model.eval()


def _prepare_waveform(sr, data):
    """Return *data* as mono float32 samples in [-1, 1] at TARGET_SAMPLE_RATE.

    Args:
        sr: Sample rate of ``data`` in Hz.
        data: Non-empty array of samples, either ``(samples,)`` or
            ``(samples, channels)``.

    Returns:
        A 1-D float32 array resampled to ``TARGET_SAMPLE_RATE``.
    """
    # Remember the dtype before converting: it is the only reliable indicator
    # of the amplitude range (guessing from the peak value breaks on quiet
    # clips, negative-only peaks and non-16-bit PCM).
    source_dtype = data.dtype
    samples = data.astype("float32")

    if samples.ndim > 1:
        # Down-mix multi-channel audio by averaging the channels.
        samples = samples.mean(axis=1)

    if source_dtype.kind == "i":
        # Signed PCM: full scale is 2**(bits - 1), e.g. 32768 for int16.
        samples = samples / float(2 ** (8 * source_dtype.itemsize - 1))
    elif source_dtype.kind == "u":
        # Unsigned PCM is centred on the midpoint rather than on zero.
        midpoint = float(2 ** (8 * source_dtype.itemsize - 1))
        samples = (samples - midpoint) / midpoint
    else:
        # Float audio is normally already in [-1, 1]; only rescale when it
        # overshoots, and then by its own peak so it is not crushed.
        peak = float(abs(samples).max())
        if peak > 1.0:
            samples = samples / peak

    if sr != TARGET_SAMPLE_RATE:
        samples = librosa.resample(
            samples, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE
        )
    return samples


def transcribe(audio) -> str:
    """Transcribe one audio clip to lower-cased text.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        The decoded transcription in lower case, or a prompt asking for audio
        when ``audio`` is ``None`` or contains no samples.
    """
    if audio is None:
        return NO_AUDIO_MESSAGE

    sr, data = audio
    if data.size == 0:
        # An empty buffer would otherwise fail inside the peak computation.
        return NO_AUDIO_MESSAGE

    waveform = _prepare_waveform(sr, data)
    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription.lower()


with gr.Blocks(title="Luganda Speech Transcriber", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Luganda Speech Transcriber\nTranscribe spoken **Luganda** audio to text.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Audio Input",
                sources=["upload", "microphone"],
                type="numpy",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription",
                placeholder="Your transcription will appear here...",
                lines=8,
            )

    # Transcribe both on an explicit button press and whenever the audio changes.
    transcribe_btn.click(fn=transcribe, inputs=audio_input, outputs=output_text)
    audio_input.change(fn=transcribe, inputs=audio_input, outputs=output_text)

if __name__ == "__main__":
    demo.launch()