"""Nano Vision Studio — Gradio front-end for Google Gemini image generation.

Provides text-to-image generation, multi-image composition (up to 14
reference images), and an iterative chat session, with optional Google
Search grounding and "Thinking Mode" traces on Gemini 3 Pro.
"""

import io
import os
import threading
import time
import uuid

import gradio as gr
from dotenv import load_dotenv
from google import genai
from google.genai import types
from PIL import Image

load_dotenv()

# --- CONFIGURATION (values match the Google GenAI documentation) ---
MODELS = {
    "🧠 Gemini 3 Pro Preview (Recommended)": "gemini-3-pro-image-preview",
    "⚡ Gemini 2.5 Flash (Fast)": "gemini-2.5-flash-image",
}

# Exact values required by the documentation.
RATIOS = ["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
RESOLUTIONS = ["1K", "2K", "4K"]  # Uppercase 'K' mandatory per docs

TEMP_CHAT_DIR = "temp_chat_images"
os.makedirs(TEMP_CHAT_DIR, exist_ok=True)


# --- UTILS ---
def get_client(api_key):
    """Return a GenAI client, raising a user-visible Gradio error if the key is missing."""
    if not api_key:
        raise gr.Error("API Key manquante")
    return genai.Client(api_key=api_key)


def safe_process_image(part):
    """Convert a response part's raw inline data into a PIL Image.

    Returns None (after logging) when the part carries no decodable image.
    """
    try:
        if part.inline_data and hasattr(part.inline_data, 'data'):
            return Image.open(io.BytesIO(part.inline_data.data))
        # Fallback: newer SDK versions expose a helper instead of raw bytes.
        if hasattr(part, 'as_image'):
            img = part.as_image()
            if hasattr(img, 'image'):
                return img.image
            return img
        return None
    except Exception as e:
        print(f"⚠️ Image conversion error: {e}")
        return None


def process_response(response):
    """Split a model response into final output, thinking trace, and sources.

    Returns a 5-tuple:
        (final_imgs, final_txt, thought_imgs, thought_txt, sources_html)
    where *sources_html* is the rendered grounding widget or None.
    """
    final_imgs, final_txt = [], ""
    thought_imgs, thought_txt = [], ""
    sources_html = None  # Container for grounding sources

    if not response or not response.parts:
        return final_imgs, final_txt, thought_imgs, thought_txt, sources_html

    print(f"\n--- RECEIVED ({len(response.parts)} parts) ---")

    # 1. Parse content (images, text, and Thinking Mode thoughts).
    for part in response.parts:
        is_thought = getattr(part, 'thought', False)
        if is_thought:
            if part.text:
                thought_txt += part.text + "\n"
            if part.inline_data:
                img = safe_process_image(part)
                if img:
                    thought_imgs.append(img)
        else:
            if part.text:
                final_txt += part.text + "\n"
            if part.inline_data:
                img = safe_process_image(part)
                if img:
                    final_imgs.append(img)

    # 2. Parse grounding metadata (sources).
    # The metadata lives on the candidate, not on the individual parts.
    if response.candidates and response.candidates[0].grounding_metadata:
        gm = response.candidates[0].grounding_metadata
        if gm.search_entry_point and gm.search_entry_point.rendered_content:
            sources_html = gm.search_entry_point.rendered_content

    return final_imgs, final_txt, thought_imgs, thought_txt, sources_html


# --- CLEANUP WORKER ---
def cleanup_old_files():
    """Background loop: every 10 minutes, delete temp chat images older than 1 hour."""
    while True:
        try:
            cutoff = time.time() - 3600  # 1 hour
            if os.path.exists(TEMP_CHAT_DIR):
                for filename in os.listdir(TEMP_CHAT_DIR):
                    filepath = os.path.join(TEMP_CHAT_DIR, filename)
                    if os.path.isfile(filepath) and os.path.getmtime(filepath) < cutoff:
                        try:
                            os.remove(filepath)
                            # FIX: f-string previously had no placeholder; log the actual path.
                            print(f"🧹 Supprimé : {filepath}")
                        except OSError:
                            pass  # best effort: file may be in use or already gone
        except Exception as e:
            print(f"⚠️ Erreur worker : {e}")
        time.sleep(600)


# --- BACKEND FUNCTIONS ---
def update_api_key(new_key):
    """Validate and store the user-provided API key in session state."""
    if not new_key:
        return "⚠️ Clé invalide", None
    return "✅ Clé enregistrée pour cette session !", new_key


def generate_studio(prompt, model_ui, ratio, resolution, grounding, user_api_key):
    """Standard Text-to-Image generation via types.GenerateContentConfig.

    Returns the 5-tuple from process_response (images, text, thought images,
    thought text, grounding sources HTML).
    """
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]

    # Strict configuration per the docs.
    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}

    if "gemini-3" in model_name:
        # image_size and Thinking Mode are Pro-only features.
        img_conf["image_size"] = resolution
        gen_conf["thinking_config"] = types.ThinkingConfig(include_thoughts=True)

    if grounding:
        gen_conf["tools"] = [{"google_search": {}}]

    gen_conf["image_config"] = types.ImageConfig(**img_conf)

    try:
        print("🚀 Sending request [T2I]...")
        response = cli.models.generate_content(
            model=model_name,
            contents=[prompt],
            config=types.GenerateContentConfig(**gen_conf),
        )
        # Returns 5 elements (including sources).
        return process_response(response)
    except Exception as e:
        raise gr.Error(f"API Error: {str(e)}")


def generate_composition(prompt, files, model_ui, ratio, resolution, grounding, user_api_key):
    """Image-to-image composition (supports up to 14 reference images per the docs)."""
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]
    if not files:
        raise gr.Error("No input images provided.")

    contents = [prompt]
    for p in files:
        try:
            contents.append(Image.open(p))
        except OSError:
            # FIX: was a bare `except`; skip unreadable files without
            # swallowing unrelated errors (KeyboardInterrupt, etc.).
            pass

    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}

    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution
        # Enable Thinking Mode here as well, for parity with the Studio tab.
        gen_conf["thinking_config"] = types.ThinkingConfig(include_thoughts=True)

    if grounding:
        gen_conf["tools"] = [{"google_search": {}}]

    gen_conf["image_config"] = types.ImageConfig(**img_conf)

    try:
        print("🚀 Sending request [I2I]")
        response = cli.models.generate_content(
            model=model_name,
            contents=contents,
            config=types.GenerateContentConfig(**gen_conf),
        )
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)
        full_text = f_txt
        # This tab has no dedicated widgets for sources/thoughts,
        # so append them to the main markdown output.
        if sources:
            full_text += f"\n\n{sources}"
        if t_txt:
            full_text += f"\n\n{t_txt}"
        return f_imgs, full_text
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")


# --- CHAT LOGIC ---
def chat_respond(message, history, chat_history_data, img_input, model_ui,
                 grounding, ratio, resolution, user_api_key):
    """One stateless chat turn, conforming to the Google GenAI types.

    Rebuilds the Gemini chat from *chat_history_data* (list of types.Content),
    sends the user message (plus optional attached images), and returns
    updated UI + state values:
        (cleared_input, cleared_files, ui_history, gemini_history, zoom_images)
    """
    if not user_api_key:
        raise gr.Error("API Key manquante")
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]

    tools = None
    thinking_conf = None

    # Image configuration.
    img_conf = {"aspect_ratio": ratio}
    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution
        # Thinking Mode config (Pro only).
        thinking_conf = types.ThinkingConfig(include_thoughts=True)

    if grounding:
        tools = [{"google_search": {}}]

    # 1. Restore the conversation into a fresh chat session.
    chat = cli.chats.create(
        model=model_name,
        config=types.GenerateContentConfig(
            response_modalities=['TEXT', 'IMAGE'],
            tools=tools,
            thinking_config=thinking_conf,
            image_config=types.ImageConfig(**img_conf),
        ),
        history=chat_history_data,
    )

    # 2. Build the user content to send.
    send_contents = [message]
    if img_input:
        for img_path in img_input:
            send_contents.append(Image.open(img_path))

    user_display_text = message
    if img_input:
        user_display_text += f"\n\n🖼️ *({len(img_input)} Images attached)*"
    user_message_obj = {"role": "user", "content": user_display_text}

    try:
        # 3. Send to the model.
        response = chat.send_message(send_contents)
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)

        # 4. Build the UI reply.
        bot_messages = []
        if t_txt or t_imgs:
            thought_md = "🧠 **Model Thought Process:**\n"
            if t_txt:
                thought_md += f"> {t_txt}\n"
            if t_imgs:
                thought_md += f"*( + {len(t_imgs)} draft image(s) not displayed)*\n"
            thought_md += "---\n"
            bot_messages.append({"role": "assistant", "content": thought_md})

        if f_txt:
            bot_messages.append({"role": "assistant", "content": f_txt})

        # Show grounding sources in the chat stream.
        if sources:
            bot_messages.append({"role": "assistant", "content": sources})

        if f_imgs:
            for i, img in enumerate(f_imgs):
                # Persist each generated image to disk so the Chatbot can serve it;
                # the cleanup worker reaps these files after an hour.
                unique_filename = f"chat_{uuid.uuid4()}_{i}.png"
                file_path = os.path.join(TEMP_CHAT_DIR, unique_filename)
                img.save(file_path)
                img_msg = {"path": file_path, "alt_text": "Generated Image"}
                bot_messages.append({"role": "assistant", "content": img_msg})

        if not f_txt and not f_imgs and not t_txt and not sources:
            bot_messages.append({"role": "assistant", "content": "⚠️ *Empty response.*"})

        # 5. Update the Gemini-side history (types.Content objects).
        u_parts = [types.Part.from_text(text=message)]
        if img_input:
            for img_path in img_input:
                with open(img_path, "rb") as f:
                    img_bytes = f.read()
                # NOTE(review): mime_type is hard-coded to JPEG regardless of the
                # uploaded file's actual format — confirm this is intentional.
                u_parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg"))
        user_content_obj = types.Content(role="user", parts=u_parts)
        model_content_obj = response.candidates[0].content

        current_data = chat_history_data if chat_history_data else []
        new_gemini_history = current_data + [user_content_obj, model_content_obj]
        new_ui_history = history + [user_message_obj] + bot_messages
        return "", None, new_ui_history, new_gemini_history, f_imgs
    except Exception as e:
        # Surface the failure in the chat instead of crashing the UI turn.
        err_msg = f"❌ Error: {str(e)}"
        bot_err_obj = {"role": "assistant", "content": err_msg}
        return "", None, history + [user_message_obj, bot_err_obj], chat_history_data, []


def clear_chat():
    """Reset the chat UI, the Gemini history state, and the zoom gallery."""
    return [], None, []


# --- GRADIO INTERFACE ---
css = """
.container { max-width: 1200px; margin: auto; }
h1 { text-align: center; color: #4F46E5; font-size: 2.5em; }
.image-container img { max-height: 400px; width: auto; }
"""

# FIX: 'theme' and 'css' are gr.Blocks() constructor arguments; Blocks.launch()
# does not accept them and would raise a TypeError at startup.
with gr.Blocks(title="Nano Vision Studio", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("# Nano 🍌 Vision Studio")
    gr.Markdown("### The Ultimate Interface: 4K Generation, Grounding, Multi-Image Composition & Iterative Chat")

    # Per-session state: the user's API key and the Gemini-side chat history.
    user_api_key_state = gr.State(os.environ.get("GOOGLE_API_KEY", ""))
    chat_state = gr.State(None)

    with gr.Tabs():
        # --- TAB 0: API ---
        with gr.TabItem("🔑 API Settings"):
            gr.Markdown("### ⚙️ API Configuration")
            with gr.Row():
                with gr.Column(scale=3):
                    api_input = gr.Textbox(label="Google Gemini API Key", type="password", lines=1)
                with gr.Column(scale=1):
                    api_btn = gr.Button("Save & Initialize 💾", variant="primary")
            api_status = gr.Markdown()
            api_btn.click(update_api_key, inputs=[api_input], outputs=[api_status, user_api_key_state])

        # --- TAB 1: STUDIO ---
        with gr.TabItem("🎨 Creation Studio"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_prompt = gr.Textbox(label="Prompt", lines=4, placeholder="Describe the scene...")
                    with gr.Group():
                        t1_model = gr.Dropdown(
                            choices=list(MODELS.keys()),
                            value="🧠 Gemini 3 Pro Preview (Recommended)",
                            label="Model",
                        )
                        with gr.Row():
                            t1_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            t1_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        t1_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t1_btn = gr.Button("Generate ✨", variant="primary", size="lg")
                with gr.Column(scale=2):
                    t1_gallery = gr.Gallery(label="Final Images", columns=2, height="auto")
                    # Component to render the grounding sources HTML widget.
                    t1_sources = gr.HTML(label="Grounding Sources")
                    t1_text = gr.Markdown(label="Generated Text")
                    with gr.Accordion("🧠 Thought Process", open=False):
                        t1_thought_imgs = gr.Gallery(label="Visual Drafts", columns=4, height=150)
                        # Markdown renders the thought stream better than plain text.
                        t1_thought_txt = gr.Markdown(label="Thought Stream")
            t1_btn.click(
                generate_studio,
                inputs=[t1_prompt, t1_model, t1_ratio, t1_res, t1_grounding, user_api_key_state],
                outputs=[t1_gallery, t1_text, t1_thought_imgs, t1_thought_txt, t1_sources],
            )

        # --- TAB 2: COMPOSITION ---
        with gr.TabItem("🛠️ Composition"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Reference Images (Max 14)", file_count="multiple", type="filepath")
                    t2_prompt = gr.Textbox(label="Instructions", lines=3)
                    with gr.Accordion("Advanced Settings", open=False):
                        t2_model = gr.Dropdown(
                            list(MODELS.keys()),
                            value="🧠 Gemini 3 Pro Preview (Recommended)",
                            label="Model",
                        )
                        with gr.Row():
                            t2_ratio = gr.Dropdown(RATIOS, value="1:1", label="Aspect Ratio")
                            t2_res = gr.Dropdown(RESOLUTIONS, value="1K", label="Output Resolution")
                        t2_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t2_btn = gr.Button("Run", variant="primary")
                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Result", columns=1)
                    t2_text = gr.Markdown()
            t2_btn.click(
                generate_composition,
                inputs=[t2_prompt, t2_files, t2_model, t2_ratio, t2_res, t2_grounding, user_api_key_state],
                outputs=[t2_gallery, t2_text],
            )

        # --- TAB 3: CHAT ---
        with gr.TabItem("💬 Chat & Refinement"):
            with gr.Row():
                with gr.Column(scale=2):
                    # NOTE(review): chat_respond emits {"role", "content"} dicts; recent
                    # Gradio expects type="messages" for that format — confirm against
                    # the pinned Gradio version before re-adding it.
                    chat_history = gr.Chatbot(label="Session History", height=600)
                    with gr.Row():
                        chat_input = gr.Textbox(label="Your Message", scale=4)
                        chat_img = gr.File(
                            label="Attach Images (Max 14)",
                            file_count="multiple",
                            type="filepath",
                            height=100,
                        )
                    with gr.Row():
                        chat_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("🗑️ New Session")
                    with gr.Accordion("Chat Options", open=False):
                        c_model = gr.Dropdown(
                            list(MODELS.keys()),
                            value="🧠 Gemini 3 Pro Preview (Recommended)",
                            label="Model",
                        )
                        with gr.Row():
                            c_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            c_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        c_grounding = gr.Checkbox(label="Grounding")
                with gr.Column(scale=1):
                    chat_gallery_zoom = gr.Gallery(label="Zoom", columns=1, height="auto")
            chat_btn.click(
                chat_respond,
                inputs=[chat_input, chat_history, chat_state, chat_img,
                        c_model, c_grounding, c_ratio, c_res, user_api_key_state],
                outputs=[chat_input, chat_img, chat_history, chat_state, chat_gallery_zoom],
            )
            clear_btn.click(
                clear_chat,
                inputs=[],
                outputs=[chat_history, chat_state, chat_gallery_zoom],
            )

        # --- TAB 4: GUIDE ---
        with gr.TabItem("📚 Guide"):
            gr.Markdown("""
# Comprehensive Guide

Welcome to the ultimate interface for **Nano Banana Pro** (Gemini 3 Pro) and **Nano Banana** (Gemini 2.5 Flash).

## 🚀 Choose Your Model

| Feature | ⚡ Gemini 2.5 Flash (Nano Banana) | 🧠 Gemini 3 Pro (Nano Banana Pro) |
| :--- | :--- | :--- |
| **Best For** | Speed, High Volume, Prototyping | Professional Assets, Complex Logic, Text Rendering |
| **Resolution** | 1024x1024 (Native) | Up to **4K** (High Fidelity) |
| **Inputs** | Text + Images | Text + up to **14 Reference Images** |
| **Special** | Fast & Efficient | **Thinking Process**, **Search Grounding** |

---

## ✨ Advanced Capabilities Explained

### 1. 🧠 The "Thinking" Process (Pro Only)
Nano Banana Pro doesn't just draw; it *thinks*. Before generating pixels, it reasons through your prompt to understand composition, lighting, and logic.
* **In this App:** Check the **"Thought Process"** accordion in the *Creation Studio* to read the model's internal monologue and see draft visualizations (Thought Images).

### 2. 🌍 Search Grounding (Real-Time Data)
The model isn't stuck in the past. It can access **Google Search** to generate images based on live data.
* **Try this:** "Visualize the current weather forecast for Tokyo as a modern chart."
* **In this App:** Enable the **"Grounding"** checkbox. Sources will appear below the generated image.

### 3. 🖼️ Advanced Composition (up to 14 Images)
While Flash handles fewer inputs, Pro can mix up to **14 images**!
* **Use Case:** Style transfer, maintaining character consistency, or complex collages.
* **How:** Use the **"Composition"** tab to upload multiple reference files.

---

## 💡 Prompting Masterclass

To get the best results, follow these professional tips:
* **Be Hyper-Specific:** Don't just say "a cat". Say *"A photorealistic close-up of a Siamese cat, golden hour lighting, captured with an 85mm lens"*.
* **Provide Context:** Explain the intent. *"Create a logo for a high-end minimalist skincare brand"*.
* **Positive Framing:** Instead of "no cars", describe the scene as *"an empty, deserted street"*.
* **Iterate with Chat:** Don't expect perfection on turn #1. Use the **Chat & Refinement** tab to say *"Make the lighting warmer"* or *"Add a wizard hat"*.

## ⚡ Performance Tips
* **4K Generation:** Available only on Pro. It costs more but delivers stunning print-quality results.
* **Aspect Ratios:** We support everything from **1:1** to **21:9** (Cinematic).
""")

if __name__ == "__main__":
    threading.Thread(target=cleanup_old_files, daemon=True).start()
    demo.queue(default_concurrency_limit=20)
    demo.launch(
        max_threads=40,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )