# app.py
"""Good News App — prototype.

Fetches articles from an RSS feed and rewrites the first M of them into a
positive tone using a small instruction-tuned model.  A conservative
proper-noun heuristic flags rewrites that appear to invent facts so a human
can review them before publishing.
"""
import os
import re
import time
from typing import Dict, List

import feedparser
import gradio as gr

# -------------------------
# Environment & cache
# -------------------------
# Point every Hugging Face cache at a writable tmp dir (needed on hosts with
# a read-only home directory, e.g. HF Spaces).
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_HOME"] = CACHE_DIR

# Models used — deliberately small so the app fits free CPU compute.
SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-6-6"
REWRITER_MODEL = "google/flan-t5-small"

# Lazy pipeline holders: loaded on first use so app startup stays fast.
_summarizer = None
_rewriter = None


# -------------------------
# Helpers: load pipelines lazily
# -------------------------
def load_summarizer():
    """Return the cached summarization pipeline, loading it on first call.

    Raises:
        RuntimeError: if the pipeline cannot be created (download failure,
            missing dependency, etc.).
    """
    global _summarizer
    if _summarizer is None:
        try:
            from transformers import pipeline
            _summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)
        except Exception as e:
            raise RuntimeError(f"Failed to load summarizer '{SUMMARIZER_MODEL}': {e}") from e
    return _summarizer


def load_rewriter():
    """Return the cached text2text-generation pipeline, loading it on first call.

    Raises:
        RuntimeError: if the pipeline cannot be created.
    """
    global _rewriter
    if _rewriter is None:
        try:
            from transformers import pipeline
            _rewriter = pipeline("text2text-generation", model=REWRITER_MODEL, device=-1)
        except Exception as e:
            raise RuntimeError(f"Failed to load rewriter '{REWRITER_MODEL}': {e}") from e
    return _rewriter


# -------------------------
# RSS fetcher
# -------------------------
def fetch_feed_entries(rss_url: str, max_items: int = 10) -> List[Dict]:
    """Parse ``rss_url`` and return up to ``max_items`` normalized entries.

    Each entry is a dict with keys ``title``, ``summary``, ``link`` and
    ``published`` (all strings; empty string when the feed omits a field).
    Returns an empty list when the feed cannot be parsed or has no entries.
    """
    feed = feedparser.parse(rss_url)
    entries: List[Dict] = []
    if not feed or not getattr(feed, "entries", None):
        return entries
    # Cast defensively: callers wired to UI sliders may pass a float.
    for e in feed.entries[: int(max_items)]:
        title = (e.get("title") or "").strip()
        # Feeds use either "summary" or "description" for the blurb.
        summary = (e.get("summary") or e.get("description") or "").strip()
        link = e.get("link", "")
        published = e.get("published", "") or e.get("updated", "")
        entries.append({
            "title": title,
            "summary": summary,
            "link": link,
            "published": published,
        })
    return entries


# -------------------------
# Repetition cleanup
# -------------------------
def cleanup_repetition(text: str) -> str:
    """Collapse degenerate repetition that small models sometimes emit.

    Removes runs of a repeated word (4+), repeated short phrases (4+),
    excess punctuation, and extra blank lines / spaces.  Returns the
    cleaned, stripped text (or the input unchanged if falsy).
    """
    if not text:
        return text
    # Same word repeated 4+ times in a row -> keep one.
    text = re.sub(r'\b(\w+)(?:\s+\1){3,}\b', r'\1', text)
    # Short phrase (3-50 chars) repeated 4+ times -> keep one occurrence.
    text = re.sub(
        r'(\b[\w\s\'",-]{3,50}?)\s*(?:[,\.\s]*)?(?:\1\s*(?:[,\.\s]*)?){3,}',
        r'\1', text, flags=re.IGNORECASE
    )
    text = re.sub(r'(\!){3,}', r'!', text)
    text = re.sub(r'(\.){4,}', '...', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'\s{3,}', ' ', text)
    return text.strip()


# -------------------------
# Safe summarizer
# -------------------------
def safe_summarize(text: str) -> str:
    """Summarize ``text``, degrading gracefully on any failure.

    Returns "" for empty input, the model summary on success, or the first
    300 characters of the input when the summarizer cannot be used.
    """
    # If empty or very short, return as-is
    text = (text or "").strip()
    if not text:
        return ""
    try:
        summ = load_summarizer()
        # summarizer expects longer text; we pass truncated text
        args = {"max_length": 120, "min_length": 30, "do_sample": False}
        try:
            res = summ(text[:4000], **args)
            return (res[0].get("summary_text") or "").strip()
        except TypeError:
            # older/newer pipeline signature fallback
            res = summ(text[:4000], max_length=120, min_length=30)
            return (res[0].get("summary_text") or "").strip()
    except Exception:
        # fallback: return truncated input rather than crash the request
        return text[:300]


# -------------------------
# Hallucination heuristic
# -------------------------
def detect_new_proper_nouns(original: str, generated: str, min_token_len: int = 3) -> bool:
    """
    Heuristic: detect capitalized tokens in generated output that are not present in original.
    It's conservative: may flag some false positives (sentence starts) but prevents major invented facts.
    """
    if not generated:
        return False

    def proper_tokens(s: str):
        # Words starting with uppercase and of at least min_token_len chars
        # ({min_token_len - 1} because the leading capital is matched separately).
        toks = re.findall(r'\b[A-Z][a-zA-Z0-9]{%d,}\b' % (min_token_len - 1), s)
        return {t.lower() for t in toks}

    orig_set = proper_tokens(original or "")
    gen_set = proper_tokens(generated or "")
    # Allow tokens that appear in both
    new = gen_set - orig_set
    # Filter out single-word sentence starts that are common words (e.g., "The").
    # Tokens are already lowercased above.
    new = {
        w for w in new
        if len(w) >= min_token_len
        and w not in {"the", "a", "an", "and", "but", "for", "from"}
    }
    return len(new) > 0


# -------------------------
# Rewriter: safe prompt + generation + post-check
# -------------------------
def rewrite_positive_text_safe(title: str, summary: str,
                               temperature: float = 0.25,
                               max_new_tokens: int = 120,
                               return_raw: bool = False) -> Dict[str, str]:
    """
    Two-step: summarize input -> rewrite with 'do not invent facts'.
    Returns dict with keys:
      - headline, summary, raw (optional)
    """
    # Step 1: create base_summary (use provided summary or generate one if summary too long/empty)
    base_text = (summary or "").strip()
    if not base_text or len(base_text) > 300:
        # combine title+summary for summarizer
        combined = (title or "") + "\n\n" + (summary or "")
        combined = combined[:4000]
        base_summary = safe_summarize(combined)
    else:
        base_summary = base_text

    # Build prompt
    prompt = (
        "You will rewrite the following short article headline and short summary into a positive and accurate tone.\n"
        "- Do NOT add new facts, proper names, locations, figures, or events that are not present in the input.\n"
        "- If you cannot rewrite without inventing facts, write exactly: \"[REVIEW REQUIRED - cannot rewrite without adding facts]\" as the SUMMARY.\n\n"
        "Format exactly (no extra text):\nHEADLINE: \nSUMMARY: <3 short sentences, positive tone>\n\n"
        "Example:\nINPUT HEADLINE: Local library reopens after renovation\nINPUT SUMMARY: The town library reopened after a two-month renovation, volunteers helped restore shelves.\nHEADLINE: Community library reopens after successful renovation\nSUMMARY: Volunteers and staff worked together to reopen the local library, returning a vital community space for reading and study.\n\n"
        f"Now rewrite this:\nINPUT HEADLINE: {title}\nINPUT SUMMARY: {base_summary}\n\nOutput:"
    )

    try:
        rewriter = load_rewriter()
    except Exception as e:
        return {"headline": title or "(model error)", "summary": f"[MODEL LOAD ERROR] {e}", "raw": ""}

    # Generate with controlled sampling - fallback if pipeline doesn't accept args.
    # BUG FIX: the original always set do_sample=True with temperature clamped
    # into [0.0, 1.0]; HF generate() raises on temperature == 0.  Only enable
    # sampling for a strictly positive temperature, otherwise use plain
    # deterministic beam search.
    temp = float(max(0.0, min(1.0, temperature)))
    gen_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "num_beams": 2,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
    }
    if temp > 0.0:
        gen_kwargs.update(do_sample=True, temperature=temp, top_p=0.9)

    raw_out = ""
    try:
        out_obj = rewriter(prompt, **gen_kwargs)
        raw_out = out_obj[0].get("generated_text", "") if isinstance(out_obj, list) else str(out_obj)
    except TypeError:
        # some pipeline versions expect different arg names
        try:
            out_obj = rewriter(prompt, max_length=int(max_new_tokens) + 50, num_beams=2)
            raw_out = out_obj[0].get("generated_text", "") if isinstance(out_obj, list) else str(out_obj)
        except Exception as e:
            return {"headline": title or "(generation error)", "summary": f"[GENERATION ERROR] {e}", "raw": ""}

    raw = (raw_out or "").strip()

    # Parse HEADLINE: and SUMMARY:
    head = ""
    summ_out = ""
    if re.search(r'HEADLINE\s*:', raw, flags=re.IGNORECASE):
        try:
            tail = re.split(r'HEADLINE\s*:\s*', raw, flags=re.IGNORECASE, maxsplit=1)[1]
            parts = re.split(r'SUMMARY\s*:\s*', tail, flags=re.IGNORECASE, maxsplit=1)
            head = (parts[0].strip() if parts else "").strip()
            summ_out = (parts[1].strip() if len(parts) > 1 else "").strip()
        except Exception:
            pass

    # Fallback parsing heuristics: first non-empty line is the headline,
    # the rest becomes the summary.
    if not head:
        lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
        if lines:
            head = lines[0][:200]
            summ_out = " ".join(lines[1:])[:1000]

    head = cleanup_repetition(head)
    summ_out = cleanup_repetition(summ_out)

    # Hallucination detection: look for new proper nouns / capitalized tokens
    orig_text = (title or "") + " " + (summary or "")
    if detect_new_proper_nouns(orig_text, head + " " + summ_out):
        # suspicious: produce conservative fallback for human review
        return {
            "headline": (title or "")[:120],
            "summary": "[REVIEW REQUIRED - cannot rewrite without adding facts]",
            "raw": raw if return_raw else ""
        }

    # Final fallbacks
    if not head:
        head = (title or "")[:120]
    if not summ_out:
        summ_out = (summary or "")[:400]

    result = {"headline": head, "summary": summ_out}
    if return_raw:
        result["raw"] = raw
    return result


# -------------------------
# Analyze & rewrite pipeline
# -------------------------
def analyze_and_rewrite(rss_url: str, max_items: int = 10, rewrite_count: int = 2,
                        temperature: float = 0.25, max_new_tokens: int = 120,
                        return_raw: bool = False):
    """Fetch a feed and rewrite its first ``rewrite_count`` entries.

    Returns a dict with ``fetched_count``, ``entries``, ``rewrites`` and
    ``elapsed`` (seconds).  A model-load failure is reported as a single
    ``{"error": ...}`` item in ``rewrites`` rather than raising.
    """
    start = time.time()
    entries = fetch_feed_entries(rss_url, max_items=max_items)
    fetched_count = len(entries)
    rewrites = []
    # int() guards against float values arriving from UI sliders
    # (range() below would reject a float).
    rewrite_count = max(0, min(int(rewrite_count), fetched_count))

    if rewrite_count > 0:
        # Attempt to load rewriter (to show friendly error early)
        try:
            load_rewriter()
        except Exception as e:
            elapsed = time.time() - start
            return {"fetched_count": fetched_count, "entries": entries,
                    "rewrites": [{"error": f"Model load error: {e}"}], "elapsed": elapsed}

        for i in range(rewrite_count):
            e = entries[i]
            try:
                r = rewrite_positive_text_safe(e["title"], e["summary"],
                                               temperature=temperature,
                                               max_new_tokens=max_new_tokens,
                                               return_raw=return_raw)
            except Exception as ex:
                r = {"headline": f"(rewrite error) {ex}", "summary": ""}
            rr = {
                "index": i,
                "original_title": e["title"],
                "original_summary": e["summary"],
                "link": e["link"],
                "published": e["published"],
                "positive_headline": r.get("headline", ""),
                "positive_summary": r.get("summary", ""),
            }
            if return_raw:
                rr["raw"] = r.get("raw", "")
            rewrites.append(rr)

    elapsed = time.time() - start
    return {"fetched_count": fetched_count, "entries": entries,
            "rewrites": rewrites, "elapsed": elapsed}


# -------------------------
# UI / Gradio
# -------------------------
DEFAULT_RSS = "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"


def run_and_render(rss_url: str, max_items: int, rewrite_count: int,
                   temperature: float, max_new_tokens: int, show_raw: bool):
    """Gradio click handler: run the pipeline and render two Markdown panels.

    Returns (fetched_md, rewrites_md, elapsed_str).
    """
    if not rss_url:
        rss_url = DEFAULT_RSS
    # BUG FIX: gr.Slider delivers floats; cast to the types the pipeline
    # needs (slicing / range() reject floats).
    result = analyze_and_rewrite(rss_url,
                                 max_items=int(max_items),
                                 rewrite_count=int(rewrite_count),
                                 temperature=float(temperature),
                                 max_new_tokens=int(max_new_tokens),
                                 return_raw=show_raw)

    # build fetched markdown
    fetched_md = f"**Fetched articles:** {result['fetched_count']}\n\n"
    for idx, e in enumerate(result["entries"]):
        published = e.get("published", "")
        fetched_md += f"- [{idx+1}] **{e.get('title','(no title)')}** \n"
        if published:
            fetched_md += f" • {published} \n"
        if e.get("link"):
            fetched_md += f" • [source]({e.get('link')}) \n"

    # rewrites markdown
    rew_md = "## ✅ Positive rewrites\n\n"
    if result["rewrites"]:
        for w in result["rewrites"]:
            # if error
            if w.get("error"):
                rew_md += f"**Error:** {w['error']}\n\n"
                continue
            rew_md += f"### Original #{w['index']+1}: [{w['original_title']}]({w['link']})\n\n"
            rew_md += f"**Positive headline:** {w['positive_headline']} \n\n"
            rew_md += f"**Positive summary:** {w['positive_summary']} \n\n"
            if show_raw and w.get("raw"):
                rew_md += f"**Raw model output:**\n```\n{w['raw']}\n```\n\n"
            rew_md += "---\n"
    else:
        rew_md += "_No rewrites produced._ \n"

    elapsed = f"{result['elapsed']:.1f}"
    return fetched_md, rew_md, elapsed


with gr.Blocks(title="📊 Good News App — Prototype") as demo:
    gr.Markdown("# 📊 Good News App — Prototype")
    gr.Markdown("Fetches articles from an RSS feed and rewrites the first M articles to a positive tone using a small instruction model. Human review is required before publishing.")
    with gr.Row():
        rss_input = gr.Textbox(label="RSS feed URL", value=DEFAULT_RSS)
    with gr.Row():
        max_items = gr.Slider(label="Max articles to fetch", minimum=1, maximum=30, value=10, step=1)
        rewrite_count = gr.Slider(label="Number of articles to rewrite (first M)", minimum=0, maximum=10, value=2, step=1)
    with gr.Row():
        temperature = gr.Slider(label="Temperature (creativity vs safety)", minimum=0.0, maximum=1.0, step=0.05, value=0.25)
        max_new_tokens = gr.Slider(label="Max new tokens (rewrite length)", minimum=40, maximum=300, step=10, value=120)
        show_raw = gr.Checkbox(label="Show raw model output (debug)", value=False)
    run_btn = gr.Button("Fetch & Rewrite")
    with gr.Row():
        elapsed_out = gr.Textbox(label="Elapsed (seconds)", interactive=False)
    fetched_panel = gr.Markdown("")
    rewrites_panel = gr.Markdown("")
    run_btn.click(fn=run_and_render,
                  inputs=[rss_input, max_items, rewrite_count, temperature, max_new_tokens, show_raw],
                  outputs=[fetched_panel, rewrites_panel, elapsed_out])
    gr.Markdown("---")
    gr.Markdown("Notes: This prototype uses small models to stay within free compute limits. The app prevents obvious hallucinations and will flag outputs requiring human review. If you need higher fidelity, consider moving to larger models on GPUs or using Hugging Face Inference API paid endpoints.")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)