add experiment scripts and results; watcher.py latest changes

2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+"""
+E1.8 Phase 1 — Ingest
+Runs taxonomy-free and standard cascade ingestion for Sub-samples A and B.
+Run this first, then run e1_8_eval.py to pull predicate counts.
+"""
+
+import os, json, time, psycopg2, requests
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Load settings from ~/aaronai/.env. override=True is passed so that, per
+# python-dotenv's documented behaviour, values in that file take precedence
+# over variables already present in the process environment.
+load_dotenv(Path.home() / "aaronai" / ".env", override=True)
+
+# Connection string handed to psycopg2.connect(); None when PG_DSN is unset.
+PG_DSN = os.getenv("PG_DSN")
+# Base URL of the service that receives episodes (POST {GRAPHITI_URL}/episodes/bulk).
+GRAPHITI_URL = "http://localhost:8001"
+# JSON file where per-source results are accumulated and from which a run resumes.
+RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
+
+# group_id values sent with each ingested episode, one per experimental condition.
+GROUP_TAXFREE  = "aaron_e18_taxfree"
+GROUP_BASELINE = "aaron_e18_baseline"
+GROUP_STANDARD = "aaron_e18_standard"
+
+# Prompt prefix for the taxonomy-free condition. run_mistral() appends the
+# document text directly after the trailing "Document:" line. The four schema
+# keys requested here are the ones build_taxfree_orientation() reads.
+TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.
+
+Do not summarize content. Do not extract entities. Do not assign a single category label.
+
+Instead, describe:
+- What domains or frames are active in this content (there may be several simultaneously)
+- How those frames relate to each other in this specific document
+- What kind of relational content a knowledge graph extractor should look for
+
+Output JSON only. No prose, no explanation, no markdown.
+
+Schema:
+{
+  "active_frames": ["<frame 1>", "<frame 2>", ...],
+  "frame_relationships": "<one sentence describing how the frames interact in this document>",
+  "extraction_orientation": "<one sentence orienting the extractor toward the most relationship-rich content>",
+  "one_sentence_summary": "<one sentence describing what the document is about>"
+}
+
+Document:
+"""
+
+# Prompt prefix for the standard (taxonomy-based) condition. run_mistral()
+# appends the document text directly after the trailing "Document:" line.
+# Of the requested fields, build_standard_orientation() in this script reads
+# only domain_class, primary_format, one_sentence_summary and two
+# content_signals flags; the full response is still stored in the results.
+STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
+
+Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
+
+Output JSON only. No prose, no explanation, no markdown code fences.
+
+Schema:
+{
+  "language": "<ISO 639-1 code>",
+  "char_length": <integer>,
+  "primary_format": "<prose|slides|code|structured|mixed>",
+  "structural_signals": {
+    "has_headings": <boolean>,
+    "has_bullet_lists": <boolean>,
+    "has_numbered_lists": <boolean>,
+    "has_tables": <boolean>,
+    "has_code_blocks": <boolean>,
+    "has_dates": <boolean>
+  },
+  "content_signals": {
+    "has_named_people": <boolean>,
+    "has_institutional_language": <boolean>,
+    "has_technical_terminology": <boolean>,
+    "has_first_person": <boolean>,
+    "has_quotations": <boolean>
+  },
+  "domain_class": "<technical|administrative|educational|personal|conversational>",
+  "one_sentence_summary": "<one sentence describing what the document is about>"
+}
+
+Document:
+"""
+
+# Sub-sample A (13 sources). run() ingests these under the taxonomy-free
+# condition only and copies their comparison figures from the E1.4 JSON file.
+# "name" is matched against embeddings.source and against the E1.4 entry
+# names; "bucket" is copied unchanged into the results file.
+SUBSAMPLE_A = [
+    {"name": "Claude: Lubbock on everything album lyrics", "bucket": "high"},
+    {"name": "ChatGPT: Tulsa Concept Album Guide", "bucket": "high"},
+    {"name": "ChatGPT: Rhino 3D object flow", "bucket": "high"},
+    {"name": "Claude: SUNY faculty conflict of interest policies", "bucket": "mid"},
+    {"name": "Claude: Interview presentation research and preparation", "bucket": "mid"},
+    {"name": "Claude: Research Statement Restructure", "bucket": "mid"},
+    {"name": "ChatGPT: Respect Individual Interests for Christmas", "bucket": "low"},
+    {"name": "University of North Texas Cover letter.pdf", "bucket": "document"},
+    {"name": "Claude: Finding ideal rural housing near University of Utah", "bucket": "high"},
+    {"name": "ChatGPT: SEC coaches with OSU ties", "bucket": "high"},
+    {"name": "Claude: Bonding ASA 3D printed parts", "bucket": "mid"},
+    {"name": "ChatGPT: Title: User request summary.", "bucket": "low"},
+    {"name": "ChatGPT: Scholarship Recommendation Letter Tips", "bucket": "low"},
+]
+
+# Sub-sample B (15 sources). run() ingests each of these under all three
+# conditions: baseline, standard and taxonomy-free. "name" is matched against
+# embeddings.source; "bucket" is copied unchanged into the results file.
+SUBSAMPLE_B = [
+    {"name": "ChatGPT: Job application comparison", "bucket": "high"},
+    {"name": "ChatGPT: External review for tenure", "bucket": "high"},
+    {"name": "Claude: University of Utah interview teaching example", "bucket": "high"},
+    {"name": "ChatGPT: Starting Dropship Gun Business", "bucket": "high"},
+    {"name": "ChatGPT: Analyze business plan", "bucket": "high"},
+    {"name": "ChatGPT: Outdoor Layering Explained", "bucket": "mid"},
+    {"name": "ChatGPT: Limits in Calculus.", "bucket": "mid"},
+    {"name": "ChatGPT: Academic Program Director Role", "bucket": "mid"},
+    {"name": "ChatGPT: Lonely Island Poop Skit", "bucket": "mid"},
+    {"name": "ChatGPT: Parse Tidal playlist", "bucket": "mid"},
+    {"name": "NO thesis proposal.pdf", "bucket": "document"},
+    {"name": "PWM.pdf", "bucket": "document"},
+    {"name": "Will_It_Print.pdf", "bucket": "document"},
+    {"name": "Kim Kedem Ind Study F2025 Syllabus.docx", "bucket": "document"},
+    {"name": "Aaron Nelson Graduate Transcript.pdf", "bucket": "document"},
+]
+
+
+def get_pg():
+    """Open and return a new psycopg2 connection built from ``PG_DSN``.
+
+    Every call creates a fresh connection; closing it is the caller's job.
+    """
+    connection = psycopg2.connect(PG_DSN)
+    return connection
+
+
+def get_document_text(source_name):
+    """Return the stored text for one source, capped at 12,000 characters.
+
+    Reads up to the first 20 rows (ordered by ``id``) of the ``embeddings``
+    table whose ``source`` equals ``source_name`` and joins their
+    ``document`` values with single spaces.
+
+    Args:
+        source_name: Value matched against the ``source`` column.
+
+    Returns:
+        The joined text truncated to 12,000 characters, or ``""`` when no
+        rows match.
+    """
+    pg = get_pg()
+    try:
+        with pg.cursor() as cur:
+            cur.execute(
+                "SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20",
+                (source_name,),
+            )
+            rows = cur.fetchall()
+    finally:
+        # Close even when the query raises so a failed lookup cannot leak a
+        # connection during a long run.
+        pg.close()
+    # NULL documents come back as None and would make str.join raise.
+    return " ".join(r[0] for r in rows if r[0] is not None)[:12000]
+
+
+def run_mistral(prompt_prefix, doc_text, label=""):
+    """Request JSON metadata for a document from the local Mistral endpoint.
+
+    Sends ``prompt_prefix + doc_text`` to
+    ``http://localhost:11434/api/generate`` (presumably a local Ollama
+    server -- confirm) with model ``mistral:latest``, streaming disabled and
+    JSON output requested, then parses the ``response`` field.
+
+    Args:
+        prompt_prefix: Prompt text placed before the document.
+        doc_text: Document text appended to the prompt.
+        label: Short tag used only in the progress messages.
+
+    Returns:
+        The parsed JSON object as a dict. When the response is not valid
+        JSON, or is valid JSON but not an object, returns
+        ``{"error": "parse_failed", "raw": <first 200 chars>}``.
+
+    Raises:
+        requests.HTTPError: If the endpoint answers with an error status.
+    """
+    print(f"    → Mistral {label} running...", flush=True)
+    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
+    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
+    resp.raise_for_status()
+    raw = resp.json().get("response", "{}")
+    print(f"    → Mistral {label} done ({len(raw)} chars)", flush=True)
+    failure = {"error": "parse_failed", "raw": str(raw)[:200]}
+    try:
+        parsed = json.loads(raw)
+    except (TypeError, ValueError, RecursionError):
+        # json.JSONDecodeError is a ValueError; the other two cover a
+        # non-string payload and pathologically nested output.
+        return failure
+    # Callers use .get() on the result, so a JSON list or scalar has to be
+    # reported as a failure rather than passed through.
+    if not isinstance(parsed, dict):
+        return failure
+    return parsed
+
+
+def build_taxfree_orientation(meta):
+    """Format taxonomy-free metadata as a one-line orientation string.
+
+    The metadata comes from a language model, so missing, null or
+    wrongly-typed fields are tolerated instead of raising.
+
+    Args:
+        meta: Dict that may contain ``active_frames``,
+            ``frame_relationships``, ``extraction_orientation`` and
+            ``one_sentence_summary``.
+
+    Returns:
+        A string of the form ``"Active frames: ... . Frame relationships:
+        ... Extraction focus: ... Summary: ..."``; absent or null fields are
+        rendered as empty text.
+    """
+    frames = meta.get("active_frames") or []
+    if not isinstance(frames, (list, tuple)):
+        # A bare string would otherwise be joined character by character.
+        frames = [frames]
+    frames_text = ", ".join(str(frame) for frame in frames)
+    rel = meta.get("frame_relationships") or ""
+    orient = meta.get("extraction_orientation") or ""
+    summary = meta.get("one_sentence_summary") or ""
+    return (
+        f"Active frames: {frames_text}. Frame relationships: {rel} "
+        f"Extraction focus: {orient} Summary: {summary}"
+    )
+
+
+def build_standard_orientation(meta):
+    """Format standard-cascade metadata as a multi-line orientation string.
+
+    The metadata comes from a language model, so missing, null or
+    wrongly-typed fields are tolerated instead of raising.
+
+    Args:
+        meta: Dict that may contain ``domain_class``, ``primary_format``,
+            ``one_sentence_summary`` and a ``content_signals`` dict.
+
+    Returns:
+        Five ``key: value`` lines joined by newlines. ``domain_class`` and
+        ``primary_format`` fall back to ``"unknown"``, the summary to empty
+        text and the two content-signal flags to ``False``.
+    """
+    dc = meta.get("domain_class") or "unknown"
+    pf = meta.get("primary_format") or "unknown"
+    summary = meta.get("one_sentence_summary") or ""
+    cs = meta.get("content_signals")
+    if not isinstance(cs, dict):
+        # A null or scalar value here would make the .get() calls below raise.
+        cs = {}
+    return (f"domain_class: {dc}\nprimary_format: {pf}\none_sentence_summary: {summary}\n"
+            f"has_named_people: {cs.get('has_named_people', False)}\n"
+            f"has_technical_terminology: {cs.get('has_technical_terminology', False)}")
+
+
+def ingest(source_name, doc_text, orientation, group_id):
+    """Submit one document as a single episode to the bulk-episodes endpoint.
+
+    Args:
+        source_name: Used as the episode name.
+        doc_text: Episode content; only the first 12,000 characters are sent.
+        orientation: Text sent as the episode's ``source_description``.
+        group_id: Group the episode is filed under.
+
+    Raises:
+        requests.HTTPError: If the endpoint answers with an error status.
+    """
+    episode = {
+        "name": source_name,
+        "content": doc_text[:12000],
+        "source_description": orientation,
+        # Every episode is sent with the same fixed timestamp.
+        "timestamp": "2026-04-28T00:00:00",
+    }
+    body = {"episodes": [episode], "group_id": group_id}
+    endpoint = f"{GRAPHITI_URL}/episodes/bulk"
+    response = requests.post(endpoint, json=body, timeout=300)
+    response.raise_for_status()
+
+
+def save(results):
+    """Write ``results`` to ``RESULTS_PATH`` as indented JSON.
+
+    The data goes to a sibling ``.tmp`` file first and is then moved over
+    the real file. This file is what a later run resumes from, so an
+    interruption mid-write must not leave it truncated.
+
+    Args:
+        results: JSON-serialisable results structure.
+    """
+    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
+    tmp_path.write_text(json.dumps(results, indent=2))
+    # Path.replace overwrites the destination in one rename step.
+    tmp_path.replace(RESULTS_PATH)
+
+
+def run():
+    """Run the E1.8 ingest phase for both sub-samples, resuming if possible.
+
+    Sub-sample A sources get only the taxonomy-free ingestion; their
+    comparison figures are copied from ``e14_per_source_comparison.json``.
+    Sub-sample B sources are ingested under all three conditions (baseline,
+    standard, taxonomy-free). After each recorded source the accumulated
+    results are written to ``RESULTS_PATH``; sources already present there
+    are skipped on the next run.
+
+    A sub-sample A source whose ingestion fails is not recorded and is
+    therefore retried on the next run. A sub-sample B source is always
+    recorded; any failed ingestion is stored under ``ingest_errors`` keyed by
+    condition, so a failed ingest can be told apart from one that simply
+    produced nothing.
+    """
+    print("E1.8 — Ingest phase")
+    print("=" * 60)
+
+    # Load existing results if resuming
+    resuming = RESULTS_PATH.exists()
+    results = json.loads(RESULTS_PATH.read_text()) if resuming else {}
+    # A results file lacking either key would otherwise raise KeyError on the
+    # first append below.
+    results.setdefault("subsample_a", [])
+    results.setdefault("subsample_b", [])
+    done_a = {r["name"] for r in results["subsample_a"]}
+    done_b = {r["name"] for r in results["subsample_b"]}
+    if resuming:
+        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")
+
+    e14_data = json.loads((Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json").read_text())
+    e14_by_name = {s["name"]: s for s in e14_data}
+
+    # Sub-sample A — taxonomy-free only (baseline + standard from E1.4)
+    print("\nSub-sample A — taxonomy-free ingestion only")
+    for item in SUBSAMPLE_A:
+        name = item["name"]
+        if name in done_a:
+            print(f"  SKIP (done): {name}")
+            continue
+        print(f"\n  {name}")
+        doc_text = get_document_text(name)
+        if not doc_text:
+            print("  SKIP — no text")
+            continue
+
+        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
+        print(f"  frames: {tf_meta.get('active_frames', 'ERROR')}")
+        orientation = build_taxfree_orientation(tf_meta)
+
+        try:
+            ingest(name, doc_text, orientation, GROUP_TAXFREE)
+            time.sleep(3)
+            print(f"  ingested to {GROUP_TAXFREE}")
+        except Exception as e:
+            # Best effort: leave the source unrecorded so a later run retries it.
+            print(f"  ingest failed: {e}")
+            continue
+
+        e14 = e14_by_name.get(name, {})
+        results["subsample_a"].append({
+            "name": name,
+            "bucket": item["bucket"],
+            "taxfree_metadata": tf_meta,
+            "taxfree_orientation": orientation,
+            "e14_prod_preds": e14.get("prod_preds"),
+            "e14_cascade_preds": e14.get("cascade_preds"),
+            "e14_delta_preds": e14.get("delta_preds"),
+            "e14_prod_edges": e14.get("prod_edges"),
+            "e14_cascade_edges": e14.get("cascade_edges"),
+            "e14_delta_edges": e14.get("delta_edges"),
+        })
+        save(results)
+
+    # Sub-sample B — all three conditions
+    print("\nSub-sample B — all three conditions")
+    for item in SUBSAMPLE_B:
+        name = item["name"]
+        if name in done_b:
+            print(f"  SKIP (done): {name}")
+            continue
+        print(f"\n  {name} ({item['bucket']})")
+        doc_text = get_document_text(name)
+        if not doc_text:
+            print("  SKIP — no text")
+            continue
+
+        # ingest_errors maps a condition name to the failure message. The
+        # source is still recorded as done, because retrying would re-ingest
+        # the conditions that already succeeded.
+        entry = {"name": name, "bucket": item["bucket"],
+                 "taxfree_metadata": None, "standard_metadata": None,
+                 "ingest_errors": {}}
+
+        # Baseline
+        try:
+            ingest(name, doc_text, "", GROUP_BASELINE)
+            time.sleep(3)
+            print("  baseline ingested")
+        except Exception as e:
+            entry["ingest_errors"]["baseline"] = str(e)
+            print(f"  baseline failed: {e}")
+
+        # Standard
+        std_meta = run_mistral(STANDARD_PROMPT, doc_text, "standard")
+        entry["standard_metadata"] = std_meta
+        try:
+            ingest(name, doc_text, build_standard_orientation(std_meta), GROUP_STANDARD)
+            time.sleep(3)
+            print(f"  standard ingested, domain_class={std_meta.get('domain_class','?')}")
+        except Exception as e:
+            entry["ingest_errors"]["standard"] = str(e)
+            print(f"  standard failed: {e}")
+
+        # Taxonomy-free
+        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
+        entry["taxfree_metadata"] = tf_meta
+        print(f"  frames: {tf_meta.get('active_frames', 'ERROR')}")
+        try:
+            ingest(name, doc_text, build_taxfree_orientation(tf_meta), GROUP_TAXFREE)
+            time.sleep(3)
+            print("  taxfree ingested")
+        except Exception as e:
+            entry["ingest_errors"]["taxfree"] = str(e)
+            print(f"  taxfree failed: {e}")
+
+        results["subsample_b"].append(entry)
+        save(results)
+
+    print("\n" + "=" * 60)
+    print(f"Ingest complete. Results at {RESULTS_PATH}")
+    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")
+
+
+if __name__ == "__main__":
+    run()