Spaces:
Sleeping
Sleeping
import json
import mimetypes
import os
import time
from typing import Optional

import gradio as gr
import requests
# Configuration
# Audio extensions accepted by the UI; compared against the lowercased
# extension of the uploaded file in transcribe_audio().
SUPPORTED_FORMATS = ['.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac', '.webm']
# Upload size ceiling in megabytes (25 MB matches the common Whisper-API
# upload limit — confirm per backend service).
MAX_FILE_SIZE_MB = 25

# Language options for Whisper
# (display label, ISO 639-1 code) pairs for the language dropdown;
# "auto" defers language detection to the chosen backend.
LANGUAGE_OPTIONS = [
    ("🌍 Auto-rilevamento", "auto"),
    ("🇮🇹 Italiano", "it"),
    ("🇺🇸 English", "en"),
    ("🇪🇸 Español", "es"),
    ("🇫🇷 Français", "fr"),
    ("🇩🇪 Deutsch", "de"),
    ("🇵🇹 Português", "pt"),
    ("🇷🇺 Русский", "ru"),
    ("🇨🇳 中文", "zh"),
    ("🇯🇵 日本語", "ja"),
    ("🇰🇷 한국어", "ko"),
    ("🇳🇱 Nederlands", "nl"),
    ("🇸🇦 العربية", "ar"),
    ("🇮🇳 हिन्दी", "hi"),
    ("🇹🇷 Türkçe", "tr"),
    ("🇵🇱 Polski", "pl"),
    ("🇸🇪 Svenska", "sv"),
    ("🇳🇴 Norsk", "no"),
    ("🇩🇰 Dansk", "da"),
    ("🇫🇮 Suomi", "fi"),
    ("🇬🇷 Ελληνικά", "el"),
    ("🇮🇱 עברית", "he"),
    ("🇹🇭 ไทย", "th"),
    ("🇻🇳 Tiếng Việt", "vi"),
    ("🇮🇩 Bahasa Indonesia", "id"),
    ("🇲🇾 Bahasa Melayu", "ms"),
    ("🇺🇦 Українська", "uk"),
    ("🇨🇿 Čeština", "cs"),
    ("🇭🇺 Magyar", "hu"),
    ("🇷🇴 Română", "ro"),
    ("🇧🇬 Български", "bg"),
    ("🇭🇷 Hrvatski", "hr"),
    ("🇸🇰 Slovenčina", "sk"),
    ("🇸🇮 Slovenščina", "sl"),
]
def transcribe_with_groq(audio_file, api_key, enable_diarization=False, language="auto"):
    """Transcribe an audio file with Groq's hosted Whisper API.

    Args:
        audio_file: Path to the audio file on disk.
        api_key: Groq API key (sent as a Bearer token).
        enable_diarization: When True, apply naive pause-based speaker
            labelling to the returned segments.
        language: ISO 639-1 code, or "auto" to let Whisper detect it.

    Returns:
        dict with "success" (bool) plus either "text"/"segments" or "error".
    """
    try:
        url = "https://api.groq.com/openai/v1/audio/transcriptions"
        with open(audio_file, "rb") as f:
            # Plain strings are the correct form for multipart *data*
            # fields; the (None, value) tuple convention the original used
            # belongs to the files= argument and only worked by accident.
            data = {
                "model": "whisper-large-v3",
                "response_format": "verbose_json",  # includes per-segment timestamps
                "timestamp_granularities[]": "segment",
            }
            # Add language parameter if not auto
            if language != "auto":
                data["language"] = language
            # Send the basename so the server sees a filename with the
            # correct audio extension rather than a full local path.
            files = {"file": (os.path.basename(audio_file), f)}
            headers = {"Authorization": f"Bearer {api_key}"}
            response = requests.post(url, files=files, data=data, headers=headers, timeout=60)
        if response.status_code == 200:
            result = response.json()
            # Extract text and segments
            text = result.get("text", "")
            segments = result.get("segments", [])
            # Apply speaker detection if enabled
            if enable_diarization and segments:
                formatted_text = format_with_speakers_groq(segments)
                return {"success": True, "text": formatted_text, "segments": segments}
            return {"success": True, "text": text, "segments": segments}
        # Status code kept in the message: the caller sniffs "401"/"403"
        # out of this string to report an invalid key.
        return {"success": False, "error": f"Groq API error: {response.status_code}"}
    except Exception as e:
        return {"success": False, "error": f"Groq request failed: {str(e)}"}
def format_with_speakers_groq(segments):
    """Render Whisper segments as speaker-labelled, timestamped lines.

    A silence gap longer than two seconds between consecutive segments is
    treated as a turn change and toggles the label between
    "Interlocutore 1" and "Interlocutore 2".
    """
    if not segments:
        return ""
    lines = []
    speaker = 1
    prev_end = 0
    for idx, seg in enumerate(segments):
        begin = seg.get("start", 0)
        content = seg.get("text", "").strip()
        if not content:
            continue
        # A > 2 s pause (never before the first segment) flips the speaker.
        if idx > 0 and begin - prev_end > 2.0:
            speaker = 1 if speaker == 2 else 2
        # Timestamp as [MM:SS] from the segment start time.
        stamp = f"[{int(begin // 60):02d}:{int(begin % 60):02d}]"
        lines.append(f"**Interlocutore {speaker}** {stamp}: {content}")
        prev_end = seg.get("end", begin)
    return "\n".join(lines)
def transcribe_with_assemblyai(audio_file, api_key, enable_diarization=False, language="auto"):
    """Transcribe an audio file with AssemblyAI, optionally with diarization.

    Workflow: upload the raw bytes, submit a transcription job, then poll
    the job status every 5 seconds for up to ~5 minutes.

    Args:
        audio_file: Path to the audio file on disk.
        api_key: AssemblyAI API key.
        enable_diarization: When True, request speaker labels and format
            the result as one line per utterance.
        language: ISO 639-1 code, or "auto" for automatic detection.

    Returns:
        dict with "success" (bool) plus either "text" or "error".
    """
    try:
        # Step 1: Upload file (streamed from disk)
        headers = {"authorization": api_key}
        with open(audio_file, "rb") as f:
            response = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
        if response.status_code != 200:
            return {"success": False, "error": f"Upload failed: {response.status_code}"}
        upload_url = response.json()["upload_url"]
        # Step 2: Request transcription with diarization and language
        data = {
            "audio_url": upload_url,
            "speaker_labels": enable_diarization,
        }
        if enable_diarization:
            # Hint the expected speaker count. Only sent when diarization
            # is on — the original sent an explicit JSON null otherwise,
            # which the API may reject.
            data["speakers_expected"] = 2
        # Add language detection or specific language
        if language == "auto":
            data["language_detection"] = True
        else:
            data["language_code"] = language
        response = requests.post("https://api.assemblyai.com/v2/transcript", json=data, headers=headers)
        if response.status_code != 200:
            return {"success": False, "error": f"Transcription request failed: {response.status_code}"}
        transcript_id = response.json()["id"]
        # Step 3: Poll for results (60 polls x 5 s ≈ 5 minutes)
        polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
        for _ in range(60):
            response = requests.get(polling_url, headers=headers)
            if response.status_code != 200:
                # Transient polling failure: wait and retry rather than
                # crashing on a non-JSON error body.
                time.sleep(5)
                continue
            result = response.json()
            if result["status"] == "completed":
                # Format with speakers if diarization is enabled
                if enable_diarization and "utterances" in result:
                    formatted_text = format_with_speakers_assemblyai(result["utterances"])
                    return {"success": True, "text": formatted_text}
                return {"success": True, "text": result["text"]}
            elif result["status"] == "error":
                return {"success": False, "error": f"Transcription failed: {result.get('error', 'Unknown error')}"}
            time.sleep(5)
        return {"success": False, "error": "Transcription timeout"}
    except Exception as e:
        return {"success": False, "error": f"AssemblyAI request failed: {str(e)}"}
def format_with_speakers_assemblyai(utterances):
    """Format AssemblyAI utterances as speaker-labelled, timestamped lines.

    Single-letter speaker labels are mapped to 1-based numbers
    (A=1, B=2, ...); any other label is shown verbatim.
    """
    if not utterances:
        return ""
    parts = []
    for utterance in utterances:
        speaker = utterance.get("speaker", "Unknown")
        text = utterance.get("text", "").strip()
        start_ms = utterance.get("start", 0)
        if not text:
            continue
        # Convert milliseconds to a [MM:SS] timestamp.
        start_seconds = start_ms / 1000
        minutes = int(start_seconds // 60)
        seconds = int(start_seconds % 60)
        timestamp = f"[{minutes:02d}:{seconds:02d}]"
        # Map speaker labels to Italian (A=1, B=2, C=3, etc.).
        # Guard on a single-character string: the original called
        # ord(speaker.upper()) whenever isalpha() was true, which raises
        # TypeError for multi-letter labels such as "AA".
        if isinstance(speaker, str) and len(speaker) == 1 and speaker.isalpha():
            speaker_num = ord(speaker.upper()) - ord('A') + 1
            speaker_label = f"Interlocutore {speaker_num}"
        else:
            speaker_label = f"Interlocutore {speaker}"
        parts.append(f"**{speaker_label}** {timestamp}: {text}")
    return "\n".join(parts)
def transcribe_with_openai(audio_file, api_key, language="auto"):
    """Transcribe an audio file with OpenAI's Whisper API.

    No speaker diarization is available on this endpoint; the caller
    (transcribe_audio) rejects that combination before reaching here.

    Args:
        audio_file: Path to the audio file on disk.
        api_key: OpenAI API key (sent as a Bearer token).
        language: ISO 639-1 code, or "auto" to let Whisper detect it.

    Returns:
        dict with "success" (bool) plus either "text" or "error".
    """
    try:
        url = "https://api.openai.com/v1/audio/transcriptions"
        with open(audio_file, "rb") as f:
            # Plain strings are the correct form for multipart *data*
            # fields; the (None, value) tuple convention the original used
            # belongs to the files= argument and only worked by accident.
            data = {"model": "whisper-1"}
            # Add language parameter if not auto
            if language != "auto":
                data["language"] = language
            # Send the basename so the server sees a filename with the
            # correct audio extension rather than a full local path.
            files = {"file": (os.path.basename(audio_file), f)}
            headers = {"Authorization": f"Bearer {api_key}"}
            response = requests.post(url, files=files, data=data, headers=headers, timeout=60)
        if response.status_code == 200:
            result = response.json()
            return {"success": True, "text": result.get("text", "")}
        # Status code kept in the message: the caller sniffs "401"/"403"
        # out of this string to report an invalid key.
        return {"success": False, "error": f"OpenAI API error: {response.status_code}"}
    except Exception as e:
        return {"success": False, "error": f"OpenAI request failed: {str(e)}"}
def transcribe_with_deepgram(audio_file, api_key):
    """Transcribe an audio file with Deepgram's /v1/listen endpoint.

    Speaker diarization is not implemented for this service; the caller
    (transcribe_audio) rejects that combination before reaching here.

    Args:
        audio_file: Path to the audio file on disk.
        api_key: Deepgram API key (Token auth scheme).

    Returns:
        dict with "success" (bool) plus either "text" or "error".
    """
    try:
        url = "https://api.deepgram.com/v1/listen"
        with open(audio_file, "rb") as f:
            audio_data = f.read()
        # Declare the real MIME type of the upload; the original always
        # claimed "audio/wav" even for MP3/OGG/FLAC files, which can make
        # the server mis-decode the audio. Fall back to the old default
        # when the extension is unknown.
        content_type, _ = mimetypes.guess_type(audio_file)
        headers = {
            "Authorization": f"Token {api_key}",
            "Content-Type": content_type or "audio/wav",
        }
        response = requests.post(url, data=audio_data, headers=headers, timeout=60)
        if response.status_code == 200:
            result = response.json()
            # Drill into results.channels[0].alternatives[0].transcript,
            # tolerating a missing key at every level.
            text = result.get("results", {}).get("channels", [{}])[0].get("alternatives", [{}])[0].get("transcript", "")
            return {"success": True, "text": text}
        return {"success": False, "error": f"Deepgram API error: {response.status_code}"}
    except Exception as e:
        return {"success": False, "error": f"Deepgram request failed: {str(e)}"}
def transcribe_audio(audio_file, service_choice, api_key, enable_diarization, language_choice, progress=gr.Progress()):
    """Main transcription function for audio files only.

    Validates the input (API key present, size within MAX_FILE_SIZE_MB,
    extension in SUPPORTED_FORMATS), dispatches to the service selected in
    the UI dropdown, and builds a Markdown run summary.

    Args:
        audio_file: Filesystem path of the upload, or None if nothing uploaded.
        service_choice: One of the dropdown labels defined in the UI.
        api_key: User-supplied key for the chosen service.
        enable_diarization: Whether speaker labelling was requested.
        language_choice: ISO 639-1 code or "auto".
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        (transcription_text, summary_markdown); on any failure the first
        element is an error message and the second is "".
    """
    if audio_file is None:
        return "❌ Carica un file audio.", ""
    # Check API key — empty strings and obvious placeholders are rejected.
    if not api_key or api_key.strip() == "" or "your-api-key" in api_key.lower():
        return f"""❌ Chiave API richiesta per {service_choice}.
**Chiavi API Gratuite Disponibili:**
🚀 **Groq (Consigliato - Veloce e Gratis):**
1. Vai su: https://console.groq.com/
2. Registrati (gratis)
3. Ottieni la chiave API dalla dashboard
4. Tier gratuito: limiti molto generosi
🎯 **AssemblyAI (Buon Tier Gratuito):**
1. Vai su: https://www.assemblyai.com/
2. Registrati (gratis)
3. Ottieni la chiave API
4. Tier gratuito: 5 ore/mese
💰 **OpenAI (A pagamento ma affidabile):**
1. Vai su: https://platform.openai.com/
2. Aggiungi metodo di pagamento
3. Ottieni la chiave API
4. Costo: ~$0.006/minuto
🔊 **Deepgram (Prova gratuita):**
1. Vai su: https://deepgram.com/
2. Registrati (prova gratuita)
3. Ottieni la chiave API
4. Tier gratuito: credito $200""", ""
    # Check file size
    file_size = os.path.getsize(audio_file)
    file_size_mb = file_size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return f"❌ File troppo grande ({file_size_mb:.1f} MB). Dimensione massima: {MAX_FILE_SIZE_MB} MB.", ""
    # Check file format
    file_extension = os.path.splitext(audio_file)[1].lower()
    if file_extension not in SUPPORTED_FORMATS:
        return f"❌ Formato non supportato: {file_extension}\n\nFormati supportati: {', '.join(SUPPORTED_FORMATS)}", ""
    progress(0.1, desc="Iniziando trascrizione...")
    start_time = time.time()
    try:
        # Choose transcription service — labels must match the UI dropdown
        # choices exactly.
        if service_choice == "Groq (Gratis e Veloce)":
            progress(0.3, desc="Trascrivendo con Groq...")
            result = transcribe_with_groq(audio_file, api_key, enable_diarization, language_choice)
        elif service_choice == "AssemblyAI (Tier Gratuito)":
            progress(0.3, desc="Caricando su AssemblyAI...")
            result = transcribe_with_assemblyai(audio_file, api_key, enable_diarization, language_choice)
        elif service_choice == "OpenAI Whisper (A Pagamento)":
            # OpenAI's endpoint has no diarization — fail fast with guidance.
            if enable_diarization:
                return "❌ OpenAI Whisper non supporta il riconoscimento interlocutori. Usa Groq o AssemblyAI per questa funzione.", ""
            progress(0.3, desc="Trascrivendo con OpenAI...")
            result = transcribe_with_openai(audio_file, api_key, language_choice)
        elif service_choice == "Deepgram (Prova Gratuita)":
            if enable_diarization:
                return "❌ Riconoscimento interlocutori non implementato per Deepgram. Usa Groq o AssemblyAI.", ""
            progress(0.3, desc="Trascrivendo con Deepgram...")
            result = transcribe_with_deepgram(audio_file, api_key)
        else:
            return "❌ Servizio non valido.", ""
        progress(0.8, desc="Elaborando risultato...")
        if not result["success"]:
            error_msg = result["error"]
            # Heuristic: the service helpers embed the HTTP status code in
            # the error string, so "401"/"403" means a bad API key.
            if "401" in error_msg or "403" in error_msg:
                return f"❌ Chiave API non valida per {service_choice}. Controlla la tua chiave API.", ""
            else:
                return f"❌ {error_msg}", ""
        transcription = result["text"]
        processing_time = time.time() - start_time
        # Duration is estimated from size/bitrate, not probed from the file.
        audio_duration = estimate_audio_duration(file_size, file_extension)
        # Create summary
        diarization_status = "✅ Attivo" if enable_diarization else "❌ Disattivo"
        language_display = "🌍 Auto-rilevamento" if language_choice == "auto" else f"🗣️ {language_choice.upper()}"
        summary = f"""✅ **Trascrizione Completata!**
📊 **Statistiche:**
• Servizio: {service_choice}
• Dimensione file: {file_size_mb:.1f} MB
• Tempo elaborazione: {processing_time:.1f} secondi
• Durata audio stimata: ~{audio_duration:.0f} secondi
• Fattore velocità: ~{audio_duration/processing_time:.1f}x tempo reale
🔧 **Impostazioni:**
• Chiave API: {"✅ Fornita" if api_key else "❌ Mancante"}
• Riconoscimento Interlocutori: {diarization_status}
• Lingua: {language_display}
• Formato: {file_extension.upper()}
"""
        progress(1.0, desc="Completato!")
        return transcription.strip(), summary
    except Exception as e:
        return f"❌ Errore durante la trascrizione: {str(e)}", ""
def estimate_audio_duration(file_size_bytes, file_extension):
    """Estimate audio duration in seconds from the file size alone.

    Divides the size (in bits) by a typical bitrate for the container
    format; the result is a rough guess, floored at 10 seconds.
    """
    # Typical bitrates in kbps per container format.
    typical_kbps = {
        '.mp3': 128, '.wav': 1411, '.m4a': 128, '.aac': 128,
        '.ogg': 128, '.flac': 800, '.webm': 128,
    }
    kbps = typical_kbps.get(file_extension, 128)
    # bytes -> bits, divided by bits/second, gives seconds.
    estimated = (file_size_bytes * 8) / (kbps * 1000)
    return max(estimated, 10)
def get_service_info():
    """Return a Markdown comparison of the supported transcription services.

    Rendered inside the "Come ottenere le chiavi API" accordion of the UI.
    """
    return """
## 🚀 **Groq (Consigliato)**
- **Costo:** Gratuito con limiti generosi
- **Velocità:** Molto veloce (inferenza ottimizzata)
- **Modello:** Whisper Large V3
- **Riconoscimento Interlocutori:** ✅ Basico (basato su pause)
- **Setup:** Registrazione rapida su console.groq.com
## 🎯 **AssemblyAI**
- **Costo:** Tier gratuito (5 ore/mese)
- **Velocità:** Veloce
- **Riconoscimento Interlocutori:** ✅ Avanzato (AI)
- **Funzioni:** Buona accuratezza, rilevamento parlanti
- **Setup:** Registrati su assemblyai.com
## 💰 **OpenAI**
- **Costo:** $0.006 per minuto (~$0.36/ora)
- **Velocità:** Buona
- **Modello:** Whisper ufficiale
- **Riconoscimento Interlocutori:** ❌ Non supportato
- **Setup:** Richiede metodo di pagamento
## 🔊 **Deepgram**
- **Costo:** Prova gratuita (credito $200)
- **Velocità:** Molto veloce (capace di tempo reale)
- **Riconoscimento Interlocutori:** ❌ Non implementato
- **Funzioni:** Buono per trascrizione live
- **Setup:** Registrati su deepgram.com
"""
# Create the Gradio interface
title = "🎙️ Trascrizione Audio con Riconoscimento Interlocutori"
description = """
Trascrizione audio professionale con riconoscimento automatico dei diversi parlanti.
Usa API affidabili come Groq (gratuito) o AssemblyAI per identificare "Interlocutore 1", "Interlocutore 2", ecc.
**✨ Caratteristiche:**
• Riconoscimento interlocutori automatico
• Modelli Whisper di alta qualità
• Timestamp precisi per ogni intervento
• Supporto per tutti i formati audio comuni
"""
# Widen the default Gradio container so the two-column layout fits.
css = """
.gradio-container {
    max-width: 1200px !important;
}
"""
with gr.Blocks(title=title, theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        # Left column: configuration, upload, and quick-start tips.
        with gr.Column(scale=1):
            # Service selection
            with gr.Group():
                gr.Markdown("### 🔧 Configurazione Servizio")
                # NOTE: these labels are matched verbatim inside
                # transcribe_audio() — keep both lists in sync.
                service_choice = gr.Dropdown(
                    choices=[
                        "Groq (Gratis e Veloce)",
                        "AssemblyAI (Tier Gratuito)",
                        "OpenAI Whisper (A Pagamento)",
                        "Deepgram (Prova Gratuita)"
                    ],
                    value="Groq (Gratis e Veloce)",
                    label="Servizio di Trascrizione",
                    info="Groq consigliato per la migliore esperienza gratuita"
                )
                api_key_input = gr.Textbox(
                    label="Chiave API",
                    placeholder="Inserisci qui la tua chiave API",
                    type="password",
                    info="Richiesta per tutti i servizi"
                )
                # Speaker detection option
                diarization_checkbox = gr.Checkbox(
                    label="🎭 Riconoscimento Interlocutori",
                    value=True,
                    info="Identifica diversi parlanti come Interlocutore 1, 2, ecc."
                )
                # Language selection
                language_dropdown = gr.Dropdown(
                    choices=LANGUAGE_OPTIONS,
                    value="auto",
                    label="🌍 Lingua Audio",
                    info="Seleziona la lingua dell'audio per migliorare l'accuratezza"
                )
            with gr.Accordion("ℹ️ Come ottenere le chiavi API", open=False):
                gr.Markdown(get_service_info())
            # Audio input
            with gr.Group():
                gr.Markdown("### 🎵 File Audio")
                audio_input = gr.Audio(
                    type="filepath",
                    label="Carica File Audio"
                )
                transcribe_btn = gr.Button(
                    "🚀 Trascrivi Audio",
                    variant="primary",
                    size="lg"
                )
            # Tips
            with gr.Group():
                gr.Markdown("""### 💡 Avvio Rapido
**Consigliato: Groq (Gratuito)**
1. Vai su [console.groq.com](https://console.groq.com/)
2. Registrati (gratis, no carta di credito)
3. Copia la chiave API
4. Carica audio e trascrivi!
**🎭 Riconoscimento Interlocutori:**
• ✅ **Groq**: Rilevamento base (basato su pause)
• ✅ **AssemblyAI**: Rilevamento avanzato (AI)
• ❌ **OpenAI/Deepgram**: Non supportato
**🌍 Selezione Lingua:**
• **Auto-rilevamento**: Lascia che l'AI detecti la lingua
• **Lingua specifica**: Migliora accuratezza e velocità
• **35+ lingue supportate**: Italiano, English, Español, etc.
**Formati supportati:**
MP3, WAV, M4A, AAC, OGG, FLAC, WebM
""")
        # Right column: transcription result and run summary.
        with gr.Column(scale=2):
            # Output
            with gr.Group():
                gr.Markdown("### 📝 Risultato Trascrizione")
                output_text = gr.Textbox(
                    label="Testo Trascritto",
                    placeholder="La tua trascrizione apparirà qui...",
                    lines=20,
                    max_lines=40,
                    show_copy_button=True
                )
                summary_output = gr.Textbox(
                    label="Riepilogo Elaborazione",
                    lines=12,
                    max_lines=20
                )
    # Event handlers
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, service_choice, api_key_input, diarization_checkbox, language_dropdown],
        outputs=[output_text, summary_output]
    )
    # Example output
    with gr.Accordion("📋 Esempio di Output con Interlocutori", open=False):
        gr.Markdown("""
**Esempio di trascrizione con riconoscimento interlocutori:**
```
**Interlocutore 1** [00:05]: Ciao, come stai oggi?
**Interlocutore 2** [00:08]: Tutto bene, grazie! E tu come va il lavoro?
**Interlocutore 1** [00:12]: Abbastanza bene, stiamo lavorando su un nuovo progetto molto interessante.
**Interlocutore 2** [00:18]: Davvero? Di cosa si tratta?
**Interlocutore 1** [00:22]: È un sistema di trascrizione automatica con riconoscimento parlanti.
```
**Formato:**
- `**Interlocutore N**`: Identificazione del parlante
- `[MM:SS]`: Timestamp dell'intervento
- Testo trascritto con punteggiatura automatica
""")
    # Footer with links
    gr.Markdown("""
---
**🔗 Ottieni Chiavi API:**
[Groq Console](https://console.groq.com/) • [AssemblyAI](https://www.assemblyai.com/) • [OpenAI Platform](https://platform.openai.com/) • [Deepgram](https://deepgram.com/)
**📚 Info:** Usa API affidabili per trascrizione • Groq e AssemblyAI supportano riconoscimento interlocutori • Supporto per 35+ lingue • Tutti i servizi usano modelli Whisper
""")
if __name__ == "__main__":
    # Launch the Gradio app. 0.0.0.0 binds all interfaces and port 7860 is
    # the conventional setup for containerized hosting (e.g. HF Spaces).
    demo.launch(
        debug=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )