# app.py
"""Good News App — prototype.

Fetches articles from an RSS feed and rewrites the first M of them into a
positive tone using a small instruction-tuned model.  A conservative
proper-noun heuristic flags rewrites that appear to invent facts so a human
can review them before publishing.
"""
import os
import re
import time
from typing import Dict, List

import feedparser
import gradio as gr

# -------------------------
# Environment & cache
# -------------------------
# Point every Hugging Face cache at a writable tmp dir (needed on hosts with
# a read-only home directory, e.g. HF Spaces).
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_HOME"] = CACHE_DIR

# Models used — deliberately small so the app fits free CPU compute.
SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-6-6"
REWRITER_MODEL = "google/flan-t5-small"

# Lazy pipeline holders: loaded on first use so app startup stays fast.
_summarizer = None
_rewriter = None


# -------------------------
# Helpers: load pipelines lazily
# -------------------------
def load_summarizer():
    """Return the cached summarization pipeline, loading it on first call.

    Raises:
        RuntimeError: if the pipeline cannot be created (download failure,
            missing dependency, etc.).
    """
    global _summarizer
    if _summarizer is None:
        try:
            from transformers import pipeline
            _summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)
        except Exception as e:
            raise RuntimeError(f"Failed to load summarizer '{SUMMARIZER_MODEL}': {e}") from e
    return _summarizer


def load_rewriter():
    """Return the cached text2text-generation pipeline, loading it on first call.

    Raises:
        RuntimeError: if the pipeline cannot be created.
    """
    global _rewriter
    if _rewriter is None:
        try:
            from transformers import pipeline
            _rewriter = pipeline("text2text-generation", model=REWRITER_MODEL, device=-1)
        except Exception as e:
            raise RuntimeError(f"Failed to load rewriter '{REWRITER_MODEL}': {e}") from e
    return _rewriter


# -------------------------
# RSS fetcher
# -------------------------
def fetch_feed_entries(rss_url: str, max_items: int = 10) -> List[Dict]:
    """Parse ``rss_url`` and return up to ``max_items`` normalized entries.

    Each entry is a dict with keys ``title``, ``summary``, ``link`` and
    ``published`` (all strings; empty string when the feed omits a field).
    Returns an empty list when the feed cannot be parsed or has no entries.
    """
    feed = feedparser.parse(rss_url)
    entries: List[Dict] = []
    if not feed or not getattr(feed, "entries", None):
        return entries
    # Cast defensively: callers wired to UI sliders may pass a float.
    for e in feed.entries[: int(max_items)]:
        title = (e.get("title") or "").strip()
        # Feeds use either "summary" or "description" for the blurb.
        summary = (e.get("summary") or e.get("description") or "").strip()
        link = e.get("link", "")
        published = e.get("published", "") or e.get("updated", "")
        entries.append({
            "title": title,
            "summary": summary,
            "link": link,
            "published": published,
        })
    return entries


# -------------------------
# Repetition cleanup
# -------------------------
def cleanup_repetition(text: str) -> str:
    """Collapse degenerate repetition that small models sometimes emit.

    Removes runs of a repeated word (4+), repeated short phrases (4+),
    excess punctuation, and extra blank lines / spaces.  Returns the
    cleaned, stripped text (or the input unchanged if falsy).
    """
    if not text:
        return text
    # Same word repeated 4+ times in a row -> keep one.
    text = re.sub(r'\b(\w+)(?:\s+\1){3,}\b', r'\1', text)
    # Short phrase (3-50 chars) repeated 4+ times -> keep one occurrence.
    text = re.sub(
        r'(\b[\w\s\'",-]{3,50}?)\s*(?:[,\.\s]*)?(?:\1\s*(?:[,\.\s]*)?){3,}',
        r'\1', text, flags=re.IGNORECASE
    )
    text = re.sub(r'(\!){3,}', r'!', text)
    text = re.sub(r'(\.){4,}', '...', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'\s{3,}', ' ', text)
    return text.strip()


# -------------------------
# Safe summarizer
# -------------------------
def safe_summarize(text: str) -> str:
    """Summarize ``text``, degrading gracefully on any failure.

    Returns "" for empty input, the model summary on success, or the first
    300 characters of the input when the summarizer cannot be used.
    """
    # If empty or very short, return as-is
    text = (text or "").strip()
    if not text:
        return ""
    try:
        summ = load_summarizer()
        # summarizer expects longer text; we pass truncated text
        args = {"max_length": 120, "min_length": 30, "do_sample": False}
        try:
            res = summ(text[:4000], **args)
            return (res[0].get("summary_text") or "").strip()
        except TypeError:
            # older/newer pipeline signature fallback
            res = summ(text[:4000], max_length=120, min_length=30)
            return (res[0].get("summary_text") or "").strip()
    except Exception:
        # fallback: return truncated input rather than crash the request
        return text[:300]


# -------------------------
# Hallucination heuristic
# -------------------------
def detect_new_proper_nouns(original: str, generated: str, min_token_len: int = 3) -> bool:
    """
    Heuristic: detect capitalized tokens in generated output that are not present in original.
    It's conservative: may flag some false positives (sentence starts) but prevents major invented facts.
    """
    if not generated:
        return False

    def proper_tokens(s: str):
        # Words starting with uppercase and of at least min_token_len chars
        # ({min_token_len - 1} because the leading capital is matched separately).
        toks = re.findall(r'\b[A-Z][a-zA-Z0-9]{%d,}\b' % (min_token_len - 1), s)
        return {t.lower() for t in toks}

    orig_set = proper_tokens(original or "")
    gen_set = proper_tokens(generated or "")
    # Allow tokens that appear in both
    new = gen_set - orig_set
    # Filter out single-word sentence starts that are common words (e.g., "The").
    # Tokens are already lowercased above.
    new = {
        w for w in new
        if len(w) >= min_token_len
        and w not in {"the", "a", "an", "and", "but", "for", "from"}
    }
    return len(new) > 0


# -------------------------
# Rewriter: safe prompt + generation + post-check
# -------------------------
def rewrite_positive_text_safe(title: str, summary: str,
                               temperature: float = 0.25,
                               max_new_tokens: int = 120,
                               return_raw: bool = False) -> Dict[str, str]:
    """
    Two-step: summarize input -> rewrite with 'do not invent facts'.
    Returns dict with keys:
      - headline, summary, raw (optional)
    """
    # Step 1: create base_summary (use provided summary or generate one if summary too long/empty)
    base_text = (summary or "").strip()
    if not base_text or len(base_text) > 300:
        # combine title+summary for summarizer
        combined = (title or "") + "\n\n" + (summary or "")
        combined = combined[:4000]
        base_summary = safe_summarize(combined)
    else:
        base_summary = base_text

    # Build prompt
    prompt = (
        "You will rewrite the following short article headline and short summary into a positive and accurate tone.\n"
        "- Do NOT add new facts, proper names, locations, figures, or events that are not present in the input.\n"
        "- If you cannot rewrite without inventing facts, write exactly: \"[REVIEW REQUIRED - cannot rewrite without adding facts]\" as the SUMMARY.\n\n"
        "Format exactly (no extra text):\nHEADLINE: \nSUMMARY: <3 short sentences, positive tone>\n\n"
        "Example:\nINPUT HEADLINE: Local library reopens after renovation\nINPUT SUMMARY: The town library reopened after a two-month renovation, volunteers helped restore shelves.\nHEADLINE: Community library reopens after successful renovation\nSUMMARY: Volunteers and staff worked together to reopen the local library, returning a vital community space for reading and study.\n\n"
        f"Now rewrite this:\nINPUT HEADLINE: {title}\nINPUT SUMMARY: {base_summary}\n\nOutput:"
    )

    try:
        rewriter = load_rewriter()
    except Exception as e:
        return {"headline": title or "(model error)", "summary": f"[MODEL LOAD ERROR] {e}", "raw": ""}

    # Generate with controlled sampling - fallback if pipeline doesn't accept args.
    # BUG FIX: the original always set do_sample=True with temperature clamped
    # into [0.0, 1.0]; HF generate() raises on temperature == 0.  Only enable
    # sampling for a strictly positive temperature, otherwise use plain
    # deterministic beam search.
    temp = float(max(0.0, min(1.0, temperature)))
    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "num_beams": 2,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
    }
    if temp > 0.0:
        gen_kwargs.update(do_sample=True, temperature=temp, top_p=0.9)

    raw_out = ""
    try:
        out_obj = rewriter(prompt, **gen_kwargs)
        raw_out = out_obj[0].get("generated_text", "") if isinstance(out_obj, list) else str(out_obj)
    except TypeError:
        # some pipeline versions expect different arg names
        try:
            out_obj = rewriter(prompt, max_length=int(max_new_tokens) + 50, num_beams=2)
            raw_out = out_obj[0].get("generated_text", "") if isinstance(out_obj, list) else str(out_obj)
        except Exception as e:
            return {"headline": title or "(generation error)", "summary": f"[GENERATION ERROR] {e}", "raw": ""}

    raw = (raw_out or "").strip()

    # Parse HEADLINE: and SUMMARY:
    head = ""
    summ_out = ""
    if re.search(r'HEADLINE\s*:', raw, flags=re.IGNORECASE):
        try:
            tail = re.split(r'HEADLINE\s*:\s*', raw, flags=re.IGNORECASE, maxsplit=1)[1]
            parts = re.split(r'SUMMARY\s*:\s*', tail, flags=re.IGNORECASE, maxsplit=1)
            head = (parts[0].strip() if parts else "").strip()
            summ_out = (parts[1].strip() if len(parts) > 1 else "").strip()
        except Exception:
            pass

    # Fallback parsing heuristics: first non-empty line is the headline,
    # the rest becomes the summary.
    if not head:
        lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
        if lines:
            head = lines[0][:200]
            summ_out = " ".join(lines[1:])[:1000]

    head = cleanup_repetition(head)
    summ_out = cleanup_repetition(summ_out)

    # Hallucination detection: look for new proper nouns / capitalized tokens
    orig_text = (title or "") + " " + (summary or "")
    if detect_new_proper_nouns(orig_text, head + " " + summ_out):
        # suspicious: produce conservative fallback for human review
        return {
            "headline": (title or "")[:120],
            "summary": "[REVIEW REQUIRED - cannot rewrite without adding facts]",
            "raw": raw if return_raw else ""
        }

    # Final fallbacks
    if not head:
        head = (title or "")[:120]
    if not summ_out:
        summ_out = (summary or "")[:400]

    result = {"headline": head, "summary": summ_out}
    if return_raw:
        result["raw"] = raw
    return result


# -------------------------
# Analyze & rewrite pipeline
# -------------------------
def analyze_and_rewrite(rss_url: str, max_items: int = 10, rewrite_count: int = 2,
                        temperature: float = 0.25, max_new_tokens: int = 120,
                        return_raw: bool = False):
    """Fetch a feed and rewrite its first ``rewrite_count`` entries.

    Returns a dict with ``fetched_count``, ``entries``, ``rewrites`` and
    ``elapsed`` (seconds).  A model-load failure is reported as a single
    ``{"error": ...}`` item in ``rewrites`` rather than raising.
    """
    start = time.time()
    entries = fetch_feed_entries(rss_url, max_items=max_items)
    fetched_count = len(entries)
    rewrites = []
    # int() guards against float values arriving from UI sliders
    # (range() below would reject a float).
    rewrite_count = max(0, min(int(rewrite_count), fetched_count))

    if rewrite_count > 0:
        # Attempt to load rewriter (to show friendly error early)
        try:
            load_rewriter()
        except Exception as e:
            elapsed = time.time() - start
            return {"fetched_count": fetched_count, "entries": entries,
                    "rewrites": [{"error": f"Model load error: {e}"}], "elapsed": elapsed}

        for i in range(rewrite_count):
            e = entries[i]
            try:
                r = rewrite_positive_text_safe(e["title"], e["summary"],
                                               temperature=temperature,
                                               max_new_tokens=max_new_tokens,
                                               return_raw=return_raw)
            except Exception as ex:
                r = {"headline": f"(rewrite error) {ex}", "summary": ""}
            rr = {
                "index": i,
                "original_title": e["title"],
                "original_summary": e["summary"],
                "link": e["link"],
                "published": e["published"],
                "positive_headline": r.get("headline", ""),
                "positive_summary": r.get("summary", ""),
            }
            if return_raw:
                rr["raw"] = r.get("raw", "")
            rewrites.append(rr)

    elapsed = time.time() - start
    return {"fetched_count": fetched_count, "entries": entries,
            "rewrites": rewrites, "elapsed": elapsed}


# -------------------------
# UI / Gradio
# -------------------------
DEFAULT_RSS = "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"


def run_and_render(rss_url: str, max_items: int, rewrite_count: int,
                   temperature: float, max_new_tokens: int, show_raw: bool):
    """Gradio click handler: run the pipeline and render two Markdown panels.

    Returns (fetched_md, rewrites_md, elapsed_str).
    """
    if not rss_url:
        rss_url = DEFAULT_RSS
    # BUG FIX: gr.Slider delivers floats; cast to the types the pipeline
    # needs (slicing / range() reject floats).
    result = analyze_and_rewrite(rss_url,
                                 max_items=int(max_items),
                                 rewrite_count=int(rewrite_count),
                                 temperature=float(temperature),
                                 max_new_tokens=int(max_new_tokens),
                                 return_raw=show_raw)

    # build fetched markdown
    fetched_md = f"**Fetched articles:** {result['fetched_count']}\n\n"
    for idx, e in enumerate(result["entries"]):
        published = e.get("published", "")
        fetched_md += f"- [{idx+1}] **{e.get('title','(no title)')}** \n"
        if published:
            fetched_md += f" • {published} \n"
        if e.get("link"):
            fetched_md += f" • [source]({e.get('link')}) \n"

    # rewrites markdown
    rew_md = "## ✅ Positive rewrites\n\n"
    if result["rewrites"]:
        for w in result["rewrites"]:
            # if error
            if w.get("error"):
                rew_md += f"**Error:** {w['error']}\n\n"
                continue
            rew_md += f"### Original #{w['index']+1}: [{w['original_title']}]({w['link']})\n\n"
            rew_md += f"**Positive headline:** {w['positive_headline']} \n\n"
            rew_md += f"**Positive summary:** {w['positive_summary']} \n\n"
            if show_raw and w.get("raw"):
                rew_md += f"**Raw model output:**\n```\n{w['raw']}\n```\n\n"
            rew_md += "---\n"
    else:
        rew_md += "_No rewrites produced._ \n"

    elapsed = f"{result['elapsed']:.1f}"
    return fetched_md, rew_md, elapsed


with gr.Blocks(title="📊 Good News App — Prototype") as demo:
    gr.Markdown("# 📊 Good News App — Prototype")
    gr.Markdown("Fetches articles from an RSS feed and rewrites the first M articles to a positive tone using a small instruction model. Human review is required before publishing.")
    with gr.Row():
        rss_input = gr.Textbox(label="RSS feed URL", value=DEFAULT_RSS)
    with gr.Row():
        max_items = gr.Slider(label="Max articles to fetch", minimum=1, maximum=30, value=10, step=1)
        rewrite_count = gr.Slider(label="Number of articles to rewrite (first M)", minimum=0, maximum=10, value=2, step=1)
    with gr.Row():
        temperature = gr.Slider(label="Temperature (creativity vs safety)", minimum=0.0, maximum=1.0, step=0.05, value=0.25)
        max_new_tokens = gr.Slider(label="Max new tokens (rewrite length)", minimum=40, maximum=300, step=10, value=120)
        show_raw = gr.Checkbox(label="Show raw model output (debug)", value=False)
    run_btn = gr.Button("Fetch & Rewrite")
    with gr.Row():
        elapsed_out = gr.Textbox(label="Elapsed (seconds)", interactive=False)
    fetched_panel = gr.Markdown("")
    rewrites_panel = gr.Markdown("")
    run_btn.click(fn=run_and_render,
                  inputs=[rss_input, max_items, rewrite_count, temperature, max_new_tokens, show_raw],
                  outputs=[fetched_panel, rewrites_panel, elapsed_out])
    gr.Markdown("---")
    gr.Markdown("Notes: This prototype uses small models to stay within free compute limits. The app prevents obvious hallucinations and will flag outputs requiring human review. If you need higher fidelity, consider moving to larger models on GPUs or using Hugging Face Inference API paid endpoints.")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)