# app.py — Hugging Face Space by Valentina9502 (commit 66c2494, verified)
import gradio as gr
import requests
import json
import time
import os
from typing import Optional
# Configuration
# Audio container formats accepted by the UI; checked against the uploaded
# file's extension before any API call is made.
SUPPORTED_FORMATS = ['.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac', '.webm']
# Upload size cap in megabytes, enforced in transcribe_audio().
MAX_FILE_SIZE_MB = 25
# Language options for Whisper
# (display label, ISO 639-1 code) pairs for the language dropdown;
# "auto" delegates language detection to the transcription service.
LANGUAGE_OPTIONS = [
    ("🌍 Auto-rilevamento", "auto"),
    ("🇮🇹 Italiano", "it"),
    ("🇺🇸 English", "en"),
    ("🇪🇸 Español", "es"),
    ("🇫🇷 Français", "fr"),
    ("🇩🇪 Deutsch", "de"),
    ("🇵🇹 Português", "pt"),
    ("🇷🇺 Русский", "ru"),
    ("🇨🇳 中文", "zh"),
    ("🇯🇵 日本語", "ja"),
    ("🇰🇷 한국어", "ko"),
    ("🇳🇱 Nederlands", "nl"),
    ("🇸🇦 العربية", "ar"),
    ("🇮🇳 हिन्दी", "hi"),
    ("🇹🇷 Türkçe", "tr"),
    ("🇵🇱 Polski", "pl"),
    ("🇸🇪 Svenska", "sv"),
    ("🇳🇴 Norsk", "no"),
    ("🇩🇰 Dansk", "da"),
    ("🇫🇮 Suomi", "fi"),
    ("🇬🇷 Ελληνικά", "el"),
    ("🇮🇱 עברית", "he"),
    ("🇹🇭 ไทย", "th"),
    ("🇻🇳 Tiếng Việt", "vi"),
    ("🇮🇩 Bahasa Indonesia", "id"),
    ("🇲🇾 Bahasa Melayu", "ms"),
    ("🇺🇦 Українська", "uk"),
    ("🇨🇿 Čeština", "cs"),
    ("🇭🇺 Magyar", "hu"),
    ("🇷🇴 Română", "ro"),
    ("🇧🇬 Български", "bg"),
    ("🇭🇷 Hrvatski", "hr"),
    ("🇸🇰 Slovenčina", "sk"),
    ("🇸🇮 Slovenščina", "sl"),
]
def transcribe_with_groq(audio_file, api_key, enable_diarization=False, language="auto"):
    """Transcribe an audio file with Groq's hosted Whisper (large-v3) model.

    Args:
        audio_file: Path to the audio file on disk.
        api_key: Groq API key, sent as a Bearer token.
        enable_diarization: When True, post-process the returned segments with
            silence-gap based speaker labelling (format_with_speakers_groq).
        language: ISO 639-1 code, or "auto" to let Whisper detect the language.

    Returns:
        dict with "success" (bool); on success also "text" and "segments",
        on failure "error" (str). Never raises.
    """
    try:
        url = "https://api.groq.com/openai/v1/audio/transcriptions"
        # Extra form fields belong in `data=` as plain strings. The previous
        # (None, value) tuple form is the multipart convention for `files=`
        # and only worked here because requests drops None items from
        # sequence-valued data fields.
        data = {
            "model": "whisper-large-v3",
            "response_format": "verbose_json",
            "timestamp_granularities[]": "segment",
        }
        # Omit the language field entirely to enable auto-detection.
        if language != "auto":
            data["language"] = language
        headers = {"Authorization": f"Bearer {api_key}"}
        with open(audio_file, "rb") as f:
            # Supply the filename so the multipart part carries a proper name.
            files = {"file": (os.path.basename(audio_file), f)}
            response = requests.post(url, files=files, data=data, headers=headers, timeout=60)
        if response.status_code == 200:
            result = response.json()
            text = result.get("text", "")
            segments = result.get("segments", [])
            # Segment-level timestamps enable naive speaker separation.
            if enable_diarization and segments:
                formatted_text = format_with_speakers_groq(segments)
                return {"success": True, "text": formatted_text, "segments": segments}
            return {"success": True, "text": text, "segments": segments}
        return {"success": False, "error": f"Groq API error: {response.status_code}"}
    except Exception as e:
        # Deliberate catch-all: callers expect an error dict, not an exception.
        return {"success": False, "error": f"Groq request failed: {str(e)}"}
def format_with_speakers_groq(segments):
    """Render Whisper segments as Markdown lines with naive speaker labels.

    A silence gap of more than two seconds between consecutive segments
    toggles the label between "Interlocutore 1" and "Interlocutore 2".
    Empty-text segments are skipped and do not advance the gap tracking.
    """
    if not segments:
        return ""
    lines = []
    speaker = 1
    prev_end = 0
    for idx, seg in enumerate(segments):
        begin = seg.get("start", 0)
        content = seg.get("text", "").strip()
        if not content:
            continue
        # Heuristic speaker change: a pause longer than 2 s (never on the
        # first segment).
        if begin - prev_end > 2.0 and idx > 0:
            speaker = 3 - speaker  # flip 1 <-> 2
        stamp = f"[{int(begin // 60):02d}:{int(begin % 60):02d}]"
        lines.append(f"**Interlocutore {speaker}** {stamp}: {content}")
        prev_end = seg.get("end", begin)
    return "\n".join(lines)
def transcribe_with_assemblyai(audio_file, api_key, enable_diarization=False, language="auto"):
    """Transcribe an audio file with AssemblyAI (upload, submit, poll).

    Args:
        audio_file: Path to the audio file on disk.
        api_key: AssemblyAI API key.
        enable_diarization: When True, request speaker labels and format the
            returned utterances with format_with_speakers_assemblyai.
        language: ISO 639-1 code, or "auto" for server-side language detection.

    Returns:
        dict with "success" (bool) and either "text" or "error". Never raises.
    """
    try:
        # Step 1: Upload file
        headers = {"authorization": api_key}
        with open(audio_file, "rb") as f:
            response = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
        if response.status_code != 200:
            return {"success": False, "error": f"Upload failed: {response.status_code}"}
        upload_url = response.json()["upload_url"]
        # Step 2: Request transcription with diarization and language
        data = {
            "audio_url": upload_url,
            "speaker_labels": enable_diarization,
        }
        if enable_diarization:
            # Only send the hint when diarization is on; previously an
            # explicit JSON null was sent, which the API can reject.
            data["speakers_expected"] = 2
        # Add language detection or specific language
        if language == "auto":
            data["language_detection"] = True
        else:
            data["language_code"] = language
        response = requests.post("https://api.assemblyai.com/v2/transcript", json=data, headers=headers)
        if response.status_code != 200:
            return {"success": False, "error": f"Transcription request failed: {response.status_code}"}
        transcript_id = response.json()["id"]
        # Step 3: Poll for results (60 polls x 5 s = up to 5 minutes).
        polling_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
        for _ in range(60):
            response = requests.get(polling_url, headers=headers)
            result = response.json()
            if result["status"] == "completed":
                # Format with speakers if diarization is enabled
                if enable_diarization and "utterances" in result:
                    formatted_text = format_with_speakers_assemblyai(result["utterances"])
                    return {"success": True, "text": formatted_text}
                return {"success": True, "text": result["text"]}
            elif result["status"] == "error":
                return {"success": False, "error": f"Transcription failed: {result.get('error', 'Unknown error')}"}
            time.sleep(5)
        return {"success": False, "error": "Transcription timeout"}
    except Exception as e:
        # Deliberate catch-all: callers expect an error dict, not an exception.
        return {"success": False, "error": f"AssemblyAI request failed: {str(e)}"}
def format_with_speakers_assemblyai(utterances):
    """Render AssemblyAI utterances as Markdown lines with speaker labels.

    Args:
        utterances: List of utterance dicts with "speaker", "text" and
            "start" (milliseconds) keys.

    Returns:
        Newline-joined "**Interlocutore N** [MM:SS]: text" lines; empty
        string for no (or only blank) utterances.
    """
    if not utterances:
        return ""
    formatted_text = ""
    for utterance in utterances:
        speaker = utterance.get("speaker", "Unknown")
        text = utterance.get("text", "").strip()
        start_ms = utterance.get("start", 0)
        if not text:
            continue
        # Convert milliseconds to minutes:seconds
        start_seconds = start_ms / 1000
        minutes = int(start_seconds // 60)
        seconds = int(start_seconds % 60)
        timestamp = f"[{minutes:02d}:{seconds:02d}]"
        # Map single-letter labels to numbers (A=1, B=2, ...). The length
        # guard matters: ord() raises TypeError on multi-character labels
        # (e.g. "AA" once the service runs past 26 speakers).
        if isinstance(speaker, str) and len(speaker) == 1 and speaker.isalpha():
            speaker_num = ord(speaker.upper()) - ord('A') + 1
            speaker_label = f"Interlocutore {speaker_num}"
        else:
            speaker_label = f"Interlocutore {speaker}"
        formatted_text += f"\n**{speaker_label}** {timestamp}: {text}"
    return formatted_text.strip()
def transcribe_with_openai(audio_file, api_key, language="auto"):
    """Transcribe an audio file with OpenAI's Whisper API (no diarization).

    Args:
        audio_file: Path to the audio file on disk.
        api_key: OpenAI API key, sent as a Bearer token.
        language: ISO 639-1 code, or "auto" to let Whisper detect the language.

    Returns:
        dict with "success" (bool) and either "text" or "error". Never raises.
    """
    try:
        url = "https://api.openai.com/v1/audio/transcriptions"
        # Extra form fields belong in `data=` as plain strings. The previous
        # (None, value) tuple form is the multipart convention for `files=`
        # and only worked because requests drops None items from
        # sequence-valued data fields.
        data = {
            "model": "whisper-1",
        }
        # Omit the language field entirely to enable auto-detection.
        if language != "auto":
            data["language"] = language
        headers = {
            "Authorization": f"Bearer {api_key}"
        }
        with open(audio_file, "rb") as f:
            # Supply the filename so the multipart part carries a proper name.
            files = {"file": (os.path.basename(audio_file), f)}
            response = requests.post(url, files=files, data=data, headers=headers, timeout=60)
        if response.status_code == 200:
            result = response.json()
            return {"success": True, "text": result.get("text", "")}
        return {"success": False, "error": f"OpenAI API error: {response.status_code}"}
    except Exception as e:
        # Deliberate catch-all: callers expect an error dict, not an exception.
        return {"success": False, "error": f"OpenAI request failed: {str(e)}"}
def transcribe_with_deepgram(audio_file, api_key):
    """Transcribe an audio file with Deepgram's /v1/listen endpoint.

    No diarization is implemented for this backend.

    Args:
        audio_file: Path to the audio file on disk.
        api_key: Deepgram API key, sent as a Token credential.

    Returns:
        dict with "success" (bool) and either "text" or "error". Never raises.
    """
    import mimetypes  # stdlib; local import keeps module-level deps unchanged
    try:
        url = "https://api.deepgram.com/v1/listen"
        with open(audio_file, "rb") as f:
            audio_data = f.read()
        # Derive the real MIME type from the filename instead of always
        # claiming audio/wav, which mislabelled mp3/m4a/ogg uploads.
        content_type = mimetypes.guess_type(audio_file)[0] or "audio/wav"
        headers = {
            "Authorization": f"Token {api_key}",
            "Content-Type": content_type
        }
        response = requests.post(url, data=audio_data, headers=headers, timeout=60)
        if response.status_code == 200:
            result = response.json()
            # Take the first alternative of the first channel; .get chains
            # default to "" when any level of the response is missing.
            text = result.get("results", {}).get("channels", [{}])[0].get("alternatives", [{}])[0].get("transcript", "")
            return {"success": True, "text": text}
        return {"success": False, "error": f"Deepgram API error: {response.status_code}"}
    except Exception as e:
        # Deliberate catch-all: callers expect an error dict, not an exception.
        return {"success": False, "error": f"Deepgram request failed: {str(e)}"}
def transcribe_audio(audio_file, service_choice, api_key, enable_diarization, language_choice, progress=gr.Progress()):
    """Validate the upload, dispatch to the chosen service, build a summary.

    Args:
        audio_file: Path from the gr.Audio component (None if nothing uploaded).
        service_choice: Label from the service dropdown.
        api_key: User-supplied API key for the chosen service.
        enable_diarization: Whether speaker labelling was requested.
        language_choice: ISO 639-1 code or "auto" from the language dropdown.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        (transcription, summary) pair of strings; on any validation or
        service error the first element carries the error message and the
        second is empty.
    """
    if audio_file is None:
        return "❌ Carica un file audio.", ""
    # Reject missing or obviously-placeholder API keys before doing any work.
    if not api_key or api_key.strip() == "" or "your-api-key" in api_key.lower():
        return f"""❌ Chiave API richiesta per {service_choice}.
**Chiavi API Gratuite Disponibili:**
🚀 **Groq (Consigliato - Veloce e Gratis):**
1. Vai su: https://console.groq.com/
2. Registrati (gratis)
3. Ottieni la chiave API dalla dashboard
4. Tier gratuito: limiti molto generosi
🎯 **AssemblyAI (Buon Tier Gratuito):**
1. Vai su: https://www.assemblyai.com/
2. Registrati (gratis)
3. Ottieni la chiave API
4. Tier gratuito: 5 ore/mese
💰 **OpenAI (A pagamento ma affidabile):**
1. Vai su: https://platform.openai.com/
2. Aggiungi metodo di pagamento
3. Ottieni la chiave API
4. Costo: ~$0.006/minuto
🔊 **Deepgram (Prova gratuita):**
1. Vai su: https://deepgram.com/
2. Registrati (prova gratuita)
3. Ottieni la chiave API
4. Tier gratuito: credito $200""", ""
    # Check file size against the shared upload cap.
    file_size = os.path.getsize(audio_file)
    file_size_mb = file_size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return f"❌ File troppo grande ({file_size_mb:.1f} MB). Dimensione massima: {MAX_FILE_SIZE_MB} MB.", ""
    # Check file format by extension.
    file_extension = os.path.splitext(audio_file)[1].lower()
    if file_extension not in SUPPORTED_FORMATS:
        return f"❌ Formato non supportato: {file_extension}\n\nFormati supportati: {', '.join(SUPPORTED_FORMATS)}", ""
    progress(0.1, desc="Iniziando trascrizione...")
    start_time = time.time()
    try:
        # Dispatch on the exact UI label of the selected service.
        if service_choice == "Groq (Gratis e Veloce)":
            progress(0.3, desc="Trascrivendo con Groq...")
            result = transcribe_with_groq(audio_file, api_key, enable_diarization, language_choice)
        elif service_choice == "AssemblyAI (Tier Gratuito)":
            progress(0.3, desc="Caricando su AssemblyAI...")
            result = transcribe_with_assemblyai(audio_file, api_key, enable_diarization, language_choice)
        elif service_choice == "OpenAI Whisper (A Pagamento)":
            if enable_diarization:
                return "❌ OpenAI Whisper non supporta il riconoscimento interlocutori. Usa Groq o AssemblyAI per questa funzione.", ""
            progress(0.3, desc="Trascrivendo con OpenAI...")
            result = transcribe_with_openai(audio_file, api_key, language_choice)
        elif service_choice == "Deepgram (Prova Gratuita)":
            if enable_diarization:
                return "❌ Riconoscimento interlocutori non implementato per Deepgram. Usa Groq o AssemblyAI.", ""
            progress(0.3, desc="Trascrivendo con Deepgram...")
            result = transcribe_with_deepgram(audio_file, api_key)
        else:
            return "❌ Servizio non valido.", ""
        progress(0.8, desc="Elaborando risultato...")
        if not result["success"]:
            error_msg = result["error"]
            # Surface auth failures with a clearer, actionable message.
            if "401" in error_msg or "403" in error_msg:
                return f"❌ Chiave API non valida per {service_choice}. Controlla la tua chiave API.", ""
            return f"❌ {error_msg}", ""
        transcription = result["text"]
        processing_time = time.time() - start_time
        audio_duration = estimate_audio_duration(file_size, file_extension)
        # Guard the speed factor against a (theoretical) zero elapsed time.
        speed_factor = audio_duration / max(processing_time, 1e-6)
        # Create summary
        diarization_status = "✅ Attivo" if enable_diarization else "❌ Disattivo"
        language_display = "🌍 Auto-rilevamento" if language_choice == "auto" else f"🗣️ {language_choice.upper()}"
        summary = f"""✅ **Trascrizione Completata!**
📊 **Statistiche:**
• Servizio: {service_choice}
• Dimensione file: {file_size_mb:.1f} MB
• Tempo elaborazione: {processing_time:.1f} secondi
• Durata audio stimata: ~{audio_duration:.0f} secondi
• Fattore velocità: ~{speed_factor:.1f}x tempo reale
🔧 **Impostazioni:**
• Chiave API: {"✅ Fornita" if api_key else "❌ Mancante"}
• Riconoscimento Interlocutori: {diarization_status}
• Lingua: {language_display}
• Formato: {file_extension.upper()}
"""
        progress(1.0, desc="Completato!")
        return transcription.strip(), summary
    except Exception as e:
        return f"❌ Errore durante la trascrizione: {str(e)}", ""
def estimate_audio_duration(file_size_bytes, file_extension):
    """Roughly estimate audio duration in seconds from file size.

    Assumes a typical bitrate per container; unknown extensions fall back
    to 128 kbps. Results are clamped to a 10-second minimum.
    """
    # Typical bitrates in kbps by extension.
    typical_kbps = {
        '.mp3': 128, '.wav': 1411, '.m4a': 128, '.aac': 128,
        '.ogg': 128, '.flac': 800, '.webm': 128,
    }
    kbps = typical_kbps.get(file_extension, 128)
    seconds = file_size_bytes * 8 / (kbps * 1000)
    # Clamp: tiny files would otherwise report near-zero durations.
    return max(seconds, 10)
def get_service_info():
    """Return a Markdown comparison of the supported transcription services.

    Rendered inside the "Come ottenere le chiavi API" accordion of the UI.
    """
    return """
## 🚀 **Groq (Consigliato)**
- **Costo:** Gratuito con limiti generosi
- **Velocità:** Molto veloce (inferenza ottimizzata)
- **Modello:** Whisper Large V3
- **Riconoscimento Interlocutori:** ✅ Basico (basato su pause)
- **Setup:** Registrazione rapida su console.groq.com
## 🎯 **AssemblyAI**
- **Costo:** Tier gratuito (5 ore/mese)
- **Velocità:** Veloce
- **Riconoscimento Interlocutori:** ✅ Avanzato (AI)
- **Funzioni:** Buona accuratezza, rilevamento parlanti
- **Setup:** Registrati su assemblyai.com
## 💰 **OpenAI**
- **Costo:** $0.006 per minuto (~$0.36/ora)
- **Velocità:** Buona
- **Modello:** Whisper ufficiale
- **Riconoscimento Interlocutori:** ❌ Non supportato
- **Setup:** Richiede metodo di pagamento
## 🔊 **Deepgram**
- **Costo:** Prova gratuita (credito $200)
- **Velocità:** Molto veloce (capace di tempo reale)
- **Riconoscimento Interlocutori:** ❌ Non implementato
- **Funzioni:** Buono per trascrizione live
- **Setup:** Registrati su deepgram.com
"""
# Create the Gradio interface
# Page title, shown in the browser tab and as the top heading.
title = "🎙️ Trascrizione Audio con Riconoscimento Interlocutori"
# Intro Markdown rendered directly under the heading.
description = """
Trascrizione audio professionale con riconoscimento automatico dei diversi parlanti.
Usa API affidabili come Groq (gratuito) o AssemblyAI per identificare "Interlocutore 1", "Interlocutore 2", ecc.
**✨ Caratteristiche:**
• Riconoscimento interlocutori automatico
• Modelli Whisper di alta qualità
• Timestamp precisi per ogni intervento
• Supporto per tutti i formati audio comuni
"""
# Custom CSS: widen the default Gradio container.
css = """
.gradio-container {
max-width: 1200px !important;
}
"""
# Build the Gradio UI: configuration column (left, scale=1) and results
# column (right, scale=2), followed by page-level event wiring and footers.
with gr.Blocks(title=title, theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column(scale=1):
            # Service selection
            with gr.Group():
                gr.Markdown("### 🔧 Configurazione Servizio")
                service_choice = gr.Dropdown(
                    choices=[
                        "Groq (Gratis e Veloce)",
                        "AssemblyAI (Tier Gratuito)",
                        "OpenAI Whisper (A Pagamento)",
                        "Deepgram (Prova Gratuita)"
                    ],
                    value="Groq (Gratis e Veloce)",
                    label="Servizio di Trascrizione",
                    info="Groq consigliato per la migliore esperienza gratuita"
                )
                # Password field so the key is masked in the browser.
                api_key_input = gr.Textbox(
                    label="Chiave API",
                    placeholder="Inserisci qui la tua chiave API",
                    type="password",
                    info="Richiesta per tutti i servizi"
                )
                # Speaker detection option (honoured by Groq and AssemblyAI only).
                diarization_checkbox = gr.Checkbox(
                    label="🎭 Riconoscimento Interlocutori",
                    value=True,
                    info="Identifica diversi parlanti come Interlocutore 1, 2, ecc."
                )
                # Language selection; "auto" defers detection to the service.
                language_dropdown = gr.Dropdown(
                    choices=LANGUAGE_OPTIONS,
                    value="auto",
                    label="🌍 Lingua Audio",
                    info="Seleziona la lingua dell'audio per migliorare l'accuratezza"
                )
            with gr.Accordion("ℹ️ Come ottenere le chiavi API", open=False):
                gr.Markdown(get_service_info())
            # Audio input
            with gr.Group():
                gr.Markdown("### 🎵 File Audio")
                # type="filepath" hands transcribe_audio a path on disk.
                audio_input = gr.Audio(
                    type="filepath",
                    label="Carica File Audio"
                )
                transcribe_btn = gr.Button(
                    "🚀 Trascrivi Audio",
                    variant="primary",
                    size="lg"
                )
            # Tips
            with gr.Group():
                gr.Markdown("""### 💡 Avvio Rapido
**Consigliato: Groq (Gratuito)**
1. Vai su [console.groq.com](https://console.groq.com/)
2. Registrati (gratis, no carta di credito)
3. Copia la chiave API
4. Carica audio e trascrivi!
**🎭 Riconoscimento Interlocutori:**
• ✅ **Groq**: Rilevamento base (basato su pause)
• ✅ **AssemblyAI**: Rilevamento avanzato (AI)
• ❌ **OpenAI/Deepgram**: Non supportato
**🌍 Selezione Lingua:**
• **Auto-rilevamento**: Lascia che l'AI detecti la lingua
• **Lingua specifica**: Migliora accuratezza e velocità
• **35+ lingue supportate**: Italiano, English, Español, etc.
**Formati supportati:**
MP3, WAV, M4A, AAC, OGG, FLAC, WebM
""")
        with gr.Column(scale=2):
            # Output
            with gr.Group():
                gr.Markdown("### 📝 Risultato Trascrizione")
                output_text = gr.Textbox(
                    label="Testo Trascritto",
                    placeholder="La tua trascrizione apparirà qui...",
                    lines=20,
                    max_lines=40,
                    show_copy_button=True
                )
                summary_output = gr.Textbox(
                    label="Riepilogo Elaborazione",
                    lines=12,
                    max_lines=20
                )
    # Event handlers: wire the button to the main transcription function.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, service_choice, api_key_input, diarization_checkbox, language_dropdown],
        outputs=[output_text, summary_output]
    )
    # Example output
    with gr.Accordion("📋 Esempio di Output con Interlocutori", open=False):
        gr.Markdown("""
**Esempio di trascrizione con riconoscimento interlocutori:**
```
**Interlocutore 1** [00:05]: Ciao, come stai oggi?
**Interlocutore 2** [00:08]: Tutto bene, grazie! E tu come va il lavoro?
**Interlocutore 1** [00:12]: Abbastanza bene, stiamo lavorando su un nuovo progetto molto interessante.
**Interlocutore 2** [00:18]: Davvero? Di cosa si tratta?
**Interlocutore 1** [00:22]: È un sistema di trascrizione automatica con riconoscimento parlanti.
```
**Formato:**
- `**Interlocutore N**`: Identificazione del parlante
- `[MM:SS]`: Timestamp dell'intervento
- Testo trascritto con punteggiatura automatica
""")
    # Footer with links
    gr.Markdown("""
---
**🔗 Ottieni Chiavi API:**
[Groq Console](https://console.groq.com/) • [AssemblyAI](https://www.assemblyai.com/) • [OpenAI Platform](https://platform.openai.com/) • [Deepgram](https://deepgram.com/)
**📚 Info:** Usa API affidabili per trascrizione • Groq e AssemblyAI supportano riconoscimento interlocutori • Supporto per 35+ lingue • Tutti i servizi usano modelli Whisper
""")
# Launch the app only when this file is executed directly (not on import).
if __name__ == "__main__":
    launch_options = {
        "debug": True,             # verbose tracebacks while developing
        "share": False,            # no public gradio.live tunnel
        "server_name": "0.0.0.0",  # listen on all interfaces (container-friendly)
        "server_port": 7860,       # standard Hugging Face Spaces port
        "show_error": True,        # surface server errors in the browser
    }
    demo.launch(**launch_options)