"""Gradio demo: speech-to-text with the Moonshine Tiny model (27M parameters)."""

import math

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import spaces
import torch
from transformers import AutoProcessor, MoonshineForConditionalGeneration

# fp16 on GPU for speed/memory, fp32 on CPU where fp16 is poorly supported.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = (
    MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")
    .to(device)
    .to(torch_dtype)
)
processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")

# Token-budget heuristic: assume roughly this many output tokens per second of
# speech, clamped below (so very short clips aren't truncated) and above (so a
# pathological duration can't make generation run away).
TOKENS_PER_SEC = 12.0
MIN_NEW_TOKENS = 48
MAX_NEW_TOKENS_CAP = 1600


@spaces.GPU
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with Moonshine Tiny.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the audio clip, as supplied by ``gr.Audio``
        with ``type="filepath"``.

    Returns
    -------
    str
        The transcription, or a placeholder message when no audio was given.
    """
    if not audio_file:
        return "No audio provided."

    audio_array, sr = sf.read(audio_file)

    # Downmix multi-channel audio to mono by averaging channels.
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=1)

    # Resample to the rate the feature extractor expects.
    target_sr = processor.feature_extractor.sampling_rate
    if sr != target_sr:
        audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)

    inputs = processor(audio_array, sampling_rate=target_sr, return_tensors="pt")
    # Move everything to the model device, but cast ONLY floating-point
    # tensors (e.g. input_values) to the model dtype. The original code cast
    # every tensor to torch_dtype, which silently converts integer tensors
    # such as attention_mask to fp16 and can break generation.
    inputs = {
        k: (
            v.to(device=device, dtype=torch_dtype)
            if torch.is_floating_point(v)
            else v.to(device=device)
        )
        for k, v in inputs.items()
    }

    # Derive the generation budget from the clip duration.
    duration_sec = len(audio_array) / float(target_sr)
    max_new_tokens = min(
        MAX_NEW_TOKENS_CAP,
        max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC))),
    )

    generated_ids = model.generate(
        **inputs,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        no_repeat_ngram_size=4,  # suppress looping n-grams in greedy decoding
        repetition_penalty=1.05,
    )
    return processor.decode(generated_ids[0], skip_special_tokens=True)


theme = gr.themes.Ocean(
    primary_hue="indigo",
    secondary_hue="fuchsia",
    neutral_hue="slate",
).set(button_large_radius="*radius_sm")

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("## Moonshine Tiny STT - 27M Parameters")
    gr.HTML("""
VibeVoice Banner
""")
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            audio_file = gr.Audio(
                sources=["upload"], type="filepath", label="Upload Audio File"
            )
            output_text1 = gr.Textbox(
                label="Transcription",
                placeholder="Transcription will appear here...",
                lines=10,
                autoscroll=True,
            )
            upload_button = gr.Button("Transcribe Uploaded Audio")
            upload_button.click(
                fn=transcribe_audio, inputs=audio_file, outputs=output_text1
            )
        with gr.TabItem("Record Audio"):
            audio_mic = gr.Audio(
                sources=["microphone"], type="filepath", label="Record Audio"
            )
            output_text2 = gr.Textbox(
                label="Transcription",
                placeholder="Transcription will appear here...",
                lines=10,
                autoscroll=True,
            )
            record_button = gr.Button("Transcribe Recorded Audio")
            record_button.click(
                fn=transcribe_audio, inputs=audio_mic, outputs=output_text2
            )
    gr.Markdown("""
### Instructions:
1. Choose either 'Upload Audio' or 'Record Audio' tab
2. Upload an audio file or record using your microphone
3. Click the respective 'Transcribe' button
4. Wait for the transcription to appear
""")

if __name__ == "__main__":
    demo.launch()