# Moonshine Tiny STT Space (Hugging Face Spaces, running on ZeroGPU)
import gradio as gr
import spaces
import torch
import soundfile as sf
import numpy as np
import librosa
import math
from transformers import MoonshineForConditionalGeneration, AutoProcessor

# Prefer half precision on GPU for speed and memory; fall back to float32 on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny").to(device).to(torch_dtype)
processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
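# A note on the load above, as a sketch of an equivalent alternative (nothing
# new assumed beyond the same model id): from_pretrained also accepts a
# torch_dtype argument, which avoids the second .to() cast.
#
#     model = MoonshineForConditionalGeneration.from_pretrained(
#         "UsefulSensors/moonshine-tiny", torch_dtype=torch_dtype
#     ).to(device)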
# Decoding budget heuristic: scale max_new_tokens with clip duration, clamped
# between a floor (very short clips) and a hard cap (very long clips).
TOKENS_PER_SEC = 12.0
MIN_NEW_TOKENS = 48
MAX_NEW_TOKENS_CAP = 1600
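# Worked example of the clamp: a 10 s clip gets ceil(10 * 12.0) = 120 new
# tokens; a 1 s clip computes 12 but is raised to the 48-token floor; anything
# longer than about 133 s (1600 / 12.0) hits the 1600-token cap.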
@spaces.GPU  # Request a ZeroGPU slot for this call; `import spaces` is otherwise unused.
def transcribe_audio(audio_file):
    if not audio_file:
        return "No audio provided."
    audio_array, sr = sf.read(audio_file)
    # Downmix multi-channel audio to mono.
    if audio_array.ndim > 1:
        audio_array = np.mean(audio_array, axis=1)
    # Resample to the feature extractor's expected rate (16 kHz for Moonshine).
    target_sr = processor.feature_extractor.sampling_rate
    if sr != target_sr:
        audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
    inputs = processor(audio_array, sampling_rate=target_sr, return_tensors="pt")
    # Move tensors to the device, casting only floating-point tensors so any
    # integer tensors (e.g. an attention mask) keep their dtype.
    inputs = {k: v.to(device=device, dtype=torch_dtype) if v.is_floating_point() else v.to(device) for k, v in inputs.items()}
    # Scale the decoding budget with clip length (see constants above).
    duration_sec = len(audio_array) / float(target_sr)
    max_new_tokens = min(MAX_NEW_TOKENS_CAP, max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC))))
    # Greedy decoding with light anti-repetition constraints.
    generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens, no_repeat_ngram_size=4, repetition_penalty=1.05)
    return processor.decode(generated_ids[0], skip_special_tokens=True)
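# Quick local smoke test, as a sketch: "sample.wav" is a hypothetical file
# path, and this assumes the spaces.GPU decorator passes calls through when
# not running on Spaces hardware.
#
#     print(transcribe_audio("sample.wav"))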
theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="fuchsia", neutral_hue="slate").set(button_large_radius="*radius_sm")

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("## Moonshine Tiny STT - 27M Parameters")
    gr.HTML("""
        <div style="width: 100%; margin-bottom: 20px;">
            <img src="https://huggingface.co/spaces/ACloudCenter/moonshine-tiny-STT/resolve/main/public/images/banner.png"
                 style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
                 alt="Moonshine Tiny STT Banner">
        </div>
    """)
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            audio_file = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
            output_text1 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=10, autoscroll=True)
            upload_button = gr.Button("Transcribe Uploaded Audio")
            upload_button.click(fn=transcribe_audio, inputs=audio_file, outputs=output_text1)
        with gr.TabItem("Record Audio"):
            audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
            output_text2 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=10, autoscroll=True)
            record_button = gr.Button("Transcribe Recorded Audio")
            record_button.click(fn=transcribe_audio, inputs=audio_mic, outputs=output_text2)
    gr.Markdown("""
    ### Instructions:
    1. Choose either the 'Upload Audio' or the 'Record Audio' tab
    2. Upload an audio file or record using your microphone
    3. Click the respective 'Transcribe' button
    4. Wait for the transcription to appear
    """)
if __name__ == "__main__":
    demo.launch()