#!/usr/bin/env python3
"""
E1.8 Phase 1 — Ingest
Runs taxonomy-free and standard cascade ingestion for Sub-samples A and B.
Run this first, then run e1_8_eval.py to pull predicate counts.
"""

import os, json, time, psycopg2, requests
from pathlib import Path
from dotenv import load_dotenv

# Load settings from ~/aaronai/.env. override=True lets values from that file
# replace variables that are already set in the process environment.
load_dotenv(Path.home() / "aaronai" / ".env", override=True)

# PostgreSQL connection string passed to psycopg2.connect() in get_pg().
# os.getenv returns None when PG_DSN is not set.
PG_DSN = os.getenv("PG_DSN")
# Base URL of the Graphiti service; ingest() posts to <GRAPHITI_URL>/episodes/bulk.
GRAPHITI_URL = "http://localhost:8001"
# JSON file written by save() and read back by run() to resume a partial run.
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"

# group_id values sent with each ingested episode, one per experimental condition.
# run() uses GROUP_TAXFREE for both sub-samples; GROUP_BASELINE and
# GROUP_STANDARD are used for Sub-sample B only.
GROUP_TAXFREE  = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"

# Prompt prefix for the taxonomy-free condition. run_mistral() appends the
# document text directly after the trailing "Document:" line.
# build_taxfree_orientation() reads the four schema keys named below from the
# model's reply, so renaming a key here requires the same change there.
TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.

Do not summarize content. Do not extract entities. Do not assign a single category label.

Instead, describe:
- What domains or frames are active in this content (there may be several simultaneously)
- How those frames relate to each other in this specific document
- What kind of relational content a knowledge graph extractor should look for

Output JSON only. No prose, no explanation, no markdown.

Schema:
{
  "active_frames": ["<frame 1>", "<frame 2>", ...],
  "frame_relationships": "<one sentence describing how the frames interact in this document>",
  "extraction_orientation": "<one sentence orienting the extractor toward the most relationship-rich content>",
  "one_sentence_summary": "<one sentence describing what the document is about>"
}

Document:
"""

# Prompt prefix for the standard (fixed-taxonomy) condition. run_mistral()
# appends the document text directly after the trailing "Document:" line.
# build_standard_orientation() only consumes domain_class, primary_format,
# one_sentence_summary and two content_signals flags (has_named_people,
# has_technical_terminology); the remaining schema fields are stored in the
# results file via "standard_metadata" but are not used for orientation.
STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.

Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.

Output JSON only. No prose, no explanation, no markdown code fences.

Schema:
{
  "language": "<ISO 639-1 code>",
  "char_length": <integer>,
  "primary_format": "<prose|slides|code|structured|mixed>",
  "structural_signals": {
    "has_headings": <boolean>,
    "has_bullet_lists": <boolean>,
    "has_numbered_lists": <boolean>,
    "has_tables": <boolean>,
    "has_code_blocks": <boolean>,
    "has_dates": <boolean>
  },
  "content_signals": {
    "has_named_people": <boolean>,
    "has_institutional_language": <boolean>,
    "has_technical_terminology": <boolean>,
    "has_first_person": <boolean>,
    "has_quotations": <boolean>
  },
  "domain_class": "<technical|administrative|educational|personal|conversational>",
  "one_sentence_summary": "<one sentence describing what the document is about>"
}

Document:
"""

# Sources for Sub-sample A. run() ingests these under the taxonomy-free
# condition only and copies their comparison figures from the E1.4 per-source
# JSON file.
#   name   - matched against embeddings.source in get_document_text() and used
#            as the episode name in ingest(); must match the stored value exactly.
#   bucket - copied unchanged into the results file.
# NOTE(review): the meaning of the bucket labels (high/mid/low/document) is not
# defined in this file — confirm against the experiment design notes.
SUBSAMPLE_A = [
    {"name": "Claude: Lubbock on everything album lyrics", "bucket": "high"},
    {"name": "ChatGPT: Tulsa Concept Album Guide", "bucket": "high"},
    {"name": "ChatGPT: Rhino 3D object flow", "bucket": "high"},
    {"name": "Claude: SUNY faculty conflict of interest policies", "bucket": "mid"},
    {"name": "Claude: Interview presentation research and preparation", "bucket": "mid"},
    {"name": "Claude: Research Statement Restructure", "bucket": "mid"},
    {"name": "ChatGPT: Respect Individual Interests for Christmas", "bucket": "low"},
    {"name": "University of North Texas Cover letter.pdf", "bucket": "document"},
    {"name": "Claude: Finding ideal rural housing near University of Utah", "bucket": "high"},
    {"name": "ChatGPT: SEC coaches with OSU ties", "bucket": "high"},
    {"name": "Claude: Bonding ASA 3D printed parts", "bucket": "mid"},
    {"name": "ChatGPT: Title: User request summary.", "bucket": "low"},
    {"name": "ChatGPT: Scholarship Recommendation Letter Tips", "bucket": "low"},
]

# Sources for Sub-sample B. run() ingests each of these three times: baseline
# (empty orientation), standard cascade, and taxonomy-free.
#   name   - matched against embeddings.source in get_document_text() and used
#            as the episode name in ingest(); must match the stored value exactly.
#   bucket - copied unchanged into the results file.
# NOTE(review): the meaning of the bucket labels (high/mid/document) is not
# defined in this file — confirm against the experiment design notes.
SUBSAMPLE_B = [
    {"name": "ChatGPT: Job application comparison", "bucket": "high"},
    {"name": "ChatGPT: External review for tenure", "bucket": "high"},
    {"name": "Claude: University of Utah interview teaching example", "bucket": "high"},
    {"name": "ChatGPT: Starting Dropship Gun Business", "bucket": "high"},
    {"name": "ChatGPT: Analyze business plan", "bucket": "high"},
    {"name": "ChatGPT: Outdoor Layering Explained", "bucket": "mid"},
    {"name": "ChatGPT: Limits in Calculus.", "bucket": "mid"},
    {"name": "ChatGPT: Academic Program Director Role", "bucket": "mid"},
    {"name": "ChatGPT: Lonely Island Poop Skit", "bucket": "mid"},
    {"name": "ChatGPT: Parse Tidal playlist", "bucket": "mid"},
    {"name": "NO thesis proposal.pdf", "bucket": "document"},
    {"name": "PWM.pdf", "bucket": "document"},
    {"name": "Will_It_Print.pdf", "bucket": "document"},
    {"name": "Kim Kedem Ind Study F2025 Syllabus.docx", "bucket": "document"},
    {"name": "Aaron Nelson Graduate Transcript.pdf", "bucket": "document"},
]


def get_pg():
    """Open a new psycopg2 connection using the PG_DSN setting and return it.

    The caller owns the connection and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection


def get_document_text(source_name):
    """Return the stored text for one source, capped at 12000 characters.

    Reads up to the first 20 rows (ordered by id) of the ``embeddings`` table
    whose ``source`` equals *source_name* and joins their ``document`` values
    with single spaces.

    Args:
        source_name: Exact value of the ``source`` column to look up.

    Returns:
        The joined text truncated to 12000 characters, or an empty string when
        no rows match.

    Raises:
        Whatever the database driver raises for connection or query failures;
        the connection is closed before the exception propagates.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute("SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20", (source_name,))
        rows = cur.fetchall()
    finally:
        # Always release the connection, including when the query fails.
        pg.close()
    # A NULL document would make str.join raise TypeError, so skip those rows.
    return " ".join(r[0] for r in rows if r[0] is not None)[:12000]


def run_mistral(prompt_prefix, doc_text, label=""):
    """Send a prompt plus document text to the local Mistral model and parse the reply.

    Posts a non-streaming, JSON-format generate request for ``mistral:latest``
    to the API at ``http://localhost:11434`` and decodes the ``response``
    field of the reply as JSON.

    Args:
        prompt_prefix: Prompt text; *doc_text* is appended directly to it.
        doc_text: Document text to analyse.
        label: Short tag used only in the progress messages.

    Returns:
        The decoded JSON object as a dict. If the reply is not valid JSON, or
        is valid JSON but not an object, returns
        ``{"error": "parse_failed", "raw": <first 200 chars of the reply>}``
        so callers can always use ``.get`` on the result.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    print(f"    → Mistral {label} running...", flush=True)
    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    resp.raise_for_status()
    raw = resp.json().get("response", "{}")
    if not isinstance(raw, str):
        # A null or non-string "response" would break len() and slicing below.
        raw = "" if raw is None else str(raw)
    print(f"    → Mistral {label} done ({len(raw)} chars)", flush=True)
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "parse_failed", "raw": raw[:200]}
    if not isinstance(parsed, dict):
        # Lists and scalars are valid JSON but do not match either prompt's schema.
        return {"error": "parse_failed", "raw": raw[:200]}
    return parsed


def build_taxfree_orientation(meta):
    """Render taxonomy-free metadata as a one-line orientation string.

    Args:
        meta: Dict decoded from the model reply. Reads ``active_frames``,
            ``frame_relationships``, ``extraction_orientation`` and
            ``one_sentence_summary``; missing or null fields render as empty.

    Returns:
        ``"Active frames: <a, b>. Frame relationships: <...> Extraction focus:
        <...> Summary: <...>"``.
    """
    def _text(value):
        # JSON null would otherwise be rendered as the literal string "None".
        return "" if value is None else str(value)

    raw_frames = meta.get("active_frames")
    if raw_frames is None:
        raw_frames = []
    elif not isinstance(raw_frames, (list, tuple)):
        # Model output is not guaranteed to follow the schema: a bare string
        # would be joined character by character, so treat it as one frame.
        raw_frames = [raw_frames]
    frames = ", ".join(str(f) for f in raw_frames if f is not None)

    rel = _text(meta.get("frame_relationships", ""))
    orient = _text(meta.get("extraction_orientation", ""))
    summary = _text(meta.get("one_sentence_summary", ""))
    return f"Active frames: {frames}. Frame relationships: {rel} Extraction focus: {orient} Summary: {summary}"


def build_standard_orientation(meta):
    """Render standard-cascade metadata as a multi-line orientation string.

    Args:
        meta: Dict decoded from the model reply. Reads ``domain_class``,
            ``primary_format``, ``one_sentence_summary`` and, from
            ``content_signals``, ``has_named_people`` and
            ``has_technical_terminology``.

    Returns:
        Five newline-separated ``key: value`` lines. Missing or null
        ``domain_class`` / ``primary_format`` render as ``unknown``, a missing
        or null summary as empty, and missing flags as ``False``.
    """
    dc = meta.get("domain_class") or "unknown"
    pf = meta.get("primary_format") or "unknown"
    summary = meta.get("one_sentence_summary")
    if summary is None:
        summary = ""
    cs = meta.get("content_signals")
    if not isinstance(cs, dict):
        # Model output may carry null or a non-object here; fall back to the
        # defaults instead of raising AttributeError on .get().
        cs = {}
    return (f"domain_class: {dc}\nprimary_format: {pf}\none_sentence_summary: {summary}\n"
            f"has_named_people: {cs.get('has_named_people', False)}\n"
            f"has_technical_terminology: {cs.get('has_technical_terminology', False)}")


def ingest(source_name, doc_text, orientation, group_id):
    """Post a single episode to the Graphiti bulk-episode endpoint.

    Args:
        source_name: Used as the episode name.
        doc_text: Episode content; only the first 12000 characters are sent.
        orientation: Text sent as the episode's ``source_description``.
        group_id: Graph group that receives the episode.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    episode = {
        "name": source_name,
        "content": doc_text[:12000],
        "source_description": orientation,
        # Every episode is sent with the same fixed timestamp.
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": group_id}
    endpoint = f"{GRAPHITI_URL}/episodes/bulk"
    response = requests.post(endpoint, json=body, timeout=300)
    response.raise_for_status()


def save(results):
    """Write *results* to RESULTS_PATH as indented JSON, atomically.

    The data is written to a sibling ``<name>.tmp`` file and then renamed over
    RESULTS_PATH, so an interruption mid-write cannot leave a truncated
    results file that would break resuming. The parent directory is created
    if it does not exist.

    Args:
        results: JSON-serialisable results structure.
    """
    RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
    tmp_path.write_text(json.dumps(results, indent=2))
    # Path.replace overwrites an existing destination in a single rename.
    tmp_path.replace(RESULTS_PATH)


def _load_previous_results():
    """Return (results, done_a, done_b), resuming from RESULTS_PATH if it exists."""
    if RESULTS_PATH.exists():
        results = json.loads(RESULTS_PATH.read_text())
        # A results file missing a sub-sample key must not break the later appends.
        results.setdefault("subsample_a", [])
        results.setdefault("subsample_b", [])
        done_a = {r["name"] for r in results["subsample_a"]}
        done_b = {r["name"] for r in results["subsample_b"]}
        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")
    else:
        results = {"subsample_a": [], "subsample_b": []}
        done_a, done_b = set(), set()
    return results, done_a, done_b


def _ingest_condition(name, doc_text, orientation, group_id, label, ok_message):
    """Ingest one episode, print the outcome, and return True on success."""
    try:
        ingest(name, doc_text, orientation, group_id)
        time.sleep(3)  # fixed pause after every successful ingest request
    except Exception as e:
        # Script-level boundary: report the failure and let the run continue.
        print(f"  {label} failed: {e}")
        return False
    print(ok_message)
    return True


def run():
    """Run the E1.8 ingest phase for Sub-samples A and B.

    Sub-sample A is ingested under the taxonomy-free condition only; its
    comparison figures are copied from the E1.4 per-source JSON file.
    Sub-sample B is ingested under all three conditions (baseline, standard,
    taxonomy-free). Results are saved after every completed source, and
    sources already present in the results file are skipped on a re-run.

    A failure in metadata extraction or ingestion for one source or condition
    is reported and does not abort the run. Sub-sample B entries list the
    conditions that failed under ``failed_conditions``.

    Raises:
        OSError / json.JSONDecodeError: If the results file or the E1.4
            comparison file cannot be read or parsed.
        Database driver errors from get_document_text() are not caught.
    """
    print("E1.8 — Ingest phase")
    print("=" * 60)

    results, done_a, done_b = _load_previous_results()

    e14_path = Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json"
    e14_data = json.loads(e14_path.read_text())
    e14_by_name = {s["name"]: s for s in e14_data}

    # Sub-sample A — taxonomy-free only (baseline + standard from E1.4)
    print("\nSub-sample A — taxonomy-free ingestion only")
    for item in SUBSAMPLE_A:
        name = item["name"]
        if name in done_a:
            print(f"  SKIP (done): {name}")
            continue
        print(f"\n  {name}")
        doc_text = get_document_text(name)
        if not doc_text:
            print("  SKIP — no text")
            continue

        try:
            tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
            print(f"  frames: {tf_meta.get('active_frames', 'ERROR')}")
            orientation = build_taxfree_orientation(tf_meta)
        except Exception as e:
            # Handled like an ingest failure: skip this source so it is retried
            # on the next run instead of aborting everything.
            print(f"  taxfree metadata failed: {e}")
            continue

        if not _ingest_condition(name, doc_text, orientation, GROUP_TAXFREE,
                                 "ingest", f"  ingested to {GROUP_TAXFREE}"):
            continue

        e14 = e14_by_name.get(name, {})
        results["subsample_a"].append({
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": tf_meta,
            "taxfree_orientation": orientation,
            "e14_prod_preds": e14.get("prod_preds"),
            "e14_cascade_preds": e14.get("cascade_preds"),
            "e14_delta_preds": e14.get("delta_preds"),
            "e14_prod_edges": e14.get("prod_edges"),
            "e14_cascade_edges": e14.get("cascade_edges"),
            "e14_delta_edges": e14.get("delta_edges"),
        })
        save(results)

    # Sub-sample B — all three conditions
    print("\nSub-sample B — all three conditions")
    for item in SUBSAMPLE_B:
        name = item["name"]
        if name in done_b:
            print(f"  SKIP (done): {name}")
            continue
        print(f"\n  {name} ({item['bucket']})")
        doc_text = get_document_text(name)
        if not doc_text:
            print("  SKIP — no text")
            continue

        # The entry is saved even when a condition fails, so a resume does not
        # re-ingest the conditions that succeeded; failures are recorded here.
        failed = []
        entry = {"name": name, "bucket": item["bucket"],
                 "taxfree_metadata": None, "standard_metadata": None,
                 "failed_conditions": failed}

        # Baseline
        if not _ingest_condition(name, doc_text, "", GROUP_BASELINE,
                                 "baseline", "  baseline ingested"):
            failed.append("baseline")

        # Standard
        try:
            std_meta = run_mistral(STANDARD_PROMPT, doc_text, "standard")
            entry["standard_metadata"] = std_meta
            std_orientation = build_standard_orientation(std_meta)
            std_ok_message = f"  standard ingested, domain_class={std_meta.get('domain_class','?')}"
        except Exception as e:
            print(f"  standard failed: {e}")
            failed.append("standard")
        else:
            if not _ingest_condition(name, doc_text, std_orientation, GROUP_STANDARD,
                                     "standard", std_ok_message):
                failed.append("standard")

        # Taxonomy-free
        try:
            tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
            entry["taxfree_metadata"] = tf_meta
            print(f"  frames: {tf_meta.get('active_frames', 'ERROR')}")
            tf_orientation = build_taxfree_orientation(tf_meta)
        except Exception as e:
            print(f"  taxfree failed: {e}")
            failed.append("taxfree")
        else:
            if not _ingest_condition(name, doc_text, tf_orientation, GROUP_TAXFREE,
                                     "taxfree", "  taxfree ingested"):
                failed.append("taxfree")

        results["subsample_b"].append(entry)
        save(results)

    print("\n" + "=" * 60)
    print(f"Ingest complete. Results at {RESULTS_PATH}")
    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")


if __name__ == "__main__":
    run()