#!/usr/bin/env python3
"""
E1.8 Phase 1 — Ingest
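Runs taxonomy-free ingestion for Sub-sample A, and baseline, standard-cascade and
taxonomy-free ingestion for Sub-sample B (Sub-sample A's baseline and standard
figures are reused from E1.4).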
Run this first, then run e1_8_eval.py to pull predicate counts.
"""
import os, json, time, psycopg2, requests
from pathlib import Path
from dotenv import load_dotenv
# Load settings from ~/aaronai/.env. override=True lets values in that file
# replace variables that are already set in the process environment.
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
# PostgreSQL connection string passed to psycopg2.connect(); None when PG_DSN is unset.
PG_DSN = os.getenv("PG_DSN")
# Base URL of the Graphiti service; ingest() posts to <GRAPHITI_URL>/episodes/bulk.
GRAPHITI_URL = "http://localhost:8001"
# JSON file that run() rewrites after every completed source and reloads to resume.
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
# Graphiti group ids, one per experimental condition.
GROUP_TAXFREE = "aaron_e18_taxfree"  # orientation built from TAXFREE_PROMPT metadata (Sub-samples A and B)
GROUP_BASELINE = "aaron_e18_baseline"  # empty orientation string (Sub-sample B only)
GROUP_STANDARD = "aaron_e18_standard"  # orientation built from STANDARD_PROMPT metadata (Sub-sample B only)
# Prompt prefix for the taxonomy-free condition. run_mistral() appends the
# document text directly after the trailing "Document:" line. The four keys in
# the schema are exactly the ones build_taxfree_orientation() reads back.
TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.
Do not summarize content. Do not extract entities. Do not assign a single category label.
Instead, describe:
- What domains or frames are active in this content (there may be several simultaneously)
- How those frames relate to each other in this specific document
- What kind of relational content a knowledge graph extractor should look for
Output JSON only. No prose, no explanation, no markdown.
Schema:
{
"active_frames": ["", "", ...],
"frame_relationships": "",
"extraction_orientation": "",
"one_sentence_summary": ""
}
Document:
"""
# Prompt prefix for the standard (fixed-schema) condition. run_mistral()
# appends the document text directly after the trailing "Document:" line.
# The schema is a fill-in template with blank values, not valid JSON itself.
# build_standard_orientation() uses only domain_class, primary_format,
# one_sentence_summary and two content_signals flags; the full response is
# still stored in the results file.
STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "",
"char_length": ,
"primary_format": "",
"structural_signals": {
"has_headings": ,
"has_bullet_lists": ,
"has_numbered_lists": ,
"has_tables": ,
"has_code_blocks": ,
"has_dates":
},
"content_signals": {
"has_named_people": ,
"has_institutional_language": ,
"has_technical_terminology": ,
"has_first_person": ,
"has_quotations":
},
"domain_class": "",
"one_sentence_summary": ""
}
Document:
"""
# Sub-sample A: sources ingested under the taxonomy-free condition only.
# Each pair is (source name looked up in the embeddings table, bucket label
# copied into the results file); list order is the processing order.
SUBSAMPLE_A = [
    {"name": source, "bucket": bucket}
    for source, bucket in (
        ("Claude: Lubbock on everything album lyrics", "high"),
        ("ChatGPT: Tulsa Concept Album Guide", "high"),
        ("ChatGPT: Rhino 3D object flow", "high"),
        ("Claude: SUNY faculty conflict of interest policies", "mid"),
        ("Claude: Interview presentation research and preparation", "mid"),
        ("Claude: Research Statement Restructure", "mid"),
        ("ChatGPT: Respect Individual Interests for Christmas", "low"),
        ("University of North Texas Cover letter.pdf", "document"),
        ("Claude: Finding ideal rural housing near University of Utah", "high"),
        ("ChatGPT: SEC coaches with OSU ties", "high"),
        ("Claude: Bonding ASA 3D printed parts", "mid"),
        ("ChatGPT: Title: User request summary.", "low"),
        ("ChatGPT: Scholarship Recommendation Letter Tips", "low"),
    )
]
# Sub-sample B: sources ingested under all three conditions (baseline,
# standard, taxonomy-free). Each pair is (source name looked up in the
# embeddings table, bucket label copied into the results file).
SUBSAMPLE_B = [
    {"name": source, "bucket": bucket}
    for source, bucket in (
        ("ChatGPT: Job application comparison", "high"),
        ("ChatGPT: External review for tenure", "high"),
        ("Claude: University of Utah interview teaching example", "high"),
        ("ChatGPT: Starting Dropship Gun Business", "high"),
        ("ChatGPT: Analyze business plan", "high"),
        ("ChatGPT: Outdoor Layering Explained", "mid"),
        ("ChatGPT: Limits in Calculus.", "mid"),
        ("ChatGPT: Academic Program Director Role", "mid"),
        ("ChatGPT: Lonely Island Poop Skit", "mid"),
        ("ChatGPT: Parse Tidal playlist", "mid"),
        ("NO thesis proposal.pdf", "document"),
        ("PWM.pdf", "document"),
        ("Will_It_Print.pdf", "document"),
        ("Kim Kedem Ind Study F2025 Syllabus.docx", "document"),
        ("Aaron Nelson Graduate Transcript.pdf", "document"),
    )
]
def get_pg():
    """Open a new PostgreSQL connection from the PG_DSN setting.

    Returns:
        A fresh psycopg2 connection. The caller is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
def get_document_text(source_name):
    """Return the stored text for *source_name*, or "" when nothing is found.

    Reads at most 20 rows of the ``embeddings`` table for the source (ordered
    by ``id``), joins their ``document`` values with single spaces and
    truncates the result to 12000 characters.

    Args:
        source_name: Value matched against the ``source`` column.

    Returns:
        The joined, truncated text; an empty string if no rows match.
    """
    pg = get_pg()
    try:
        # The cursor context manager closes the cursor. The connection is
        # closed in ``finally`` so a failed query cannot leak it.
        with pg.cursor() as cur:
            cur.execute(
                "SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20",
                (source_name,),
            )
            rows = cur.fetchall()
    finally:
        pg.close()
    # Skip NULL documents: str.join raises TypeError on None.
    return " ".join(row[0] for row in rows if row[0] is not None)[:12000]
def run_mistral(prompt_prefix, doc_text, label=""):
    """Ask the local Ollama ``mistral:latest`` model for JSON metadata.

    The prompt sent is ``prompt_prefix + doc_text``, with streaming disabled
    and JSON output requested.

    Args:
        prompt_prefix: Instruction text placed before the document.
        doc_text: Document text appended to the prompt.
        label: Short tag used only in the progress messages.

    Returns:
        The decoded JSON object as a dict. If the reply is not valid JSON, or
        is valid JSON but not an object, returns
        ``{"error": "parse_failed", "raw": <first 200 chars of the reply>}``
        so callers can always use ``.get`` on the result.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    print(f" → Mistral {label} running...", flush=True)
    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    resp.raise_for_status()
    raw = resp.json().get("response", "{}")
    if raw is None:
        # A null "response" field would otherwise crash on len() below.
        raw = ""
    print(f" → Mistral {label} done ({len(raw)} chars)", flush=True)
    failure = {"error": "parse_failed", "raw": raw[:200]}
    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError subclass.
        return failure
    # Callers treat the result as a dict; a bare list/string/number is unusable.
    return parsed if isinstance(parsed, dict) else failure
def build_taxfree_orientation(meta):
    """Format taxonomy-free metadata as a one-line orientation string.

    Args:
        meta: Dict produced from the TAXFREE_PROMPT response. Missing or null
            fields are rendered as empty text.

    Returns:
        ``"Active frames: <a, b>. Frame relationships: <..> Extraction focus:
        <..> Summary: <..>"``. Output for well-formed metadata is unchanged
        from the previous implementation.
    """
    frames = meta.get("active_frames") or []
    if isinstance(frames, str):
        # A bare string must not be joined character by character.
        frames = [frames]
    elif not isinstance(frames, (list, tuple)):
        frames = []
    # Model output is untrusted: drop nulls and stringify anything else so
    # str.join cannot raise TypeError.
    frames_text = ", ".join(str(frame) for frame in frames if frame is not None)
    rel = meta.get("frame_relationships") or ""
    orient = meta.get("extraction_orientation") or ""
    summary = meta.get("one_sentence_summary") or ""
    return (
        f"Active frames: {frames_text}. Frame relationships: {rel} "
        f"Extraction focus: {orient} Summary: {summary}"
    )
def build_standard_orientation(meta):
    """Format standard-schema metadata as a multi-line orientation string.

    Args:
        meta: Dict produced from the STANDARD_PROMPT response. Missing or null
            ``domain_class`` / ``primary_format`` become ``"unknown"``; a
            missing, null or non-dict ``content_signals`` is treated as empty.

    Returns:
        Five ``key: value`` lines joined by newlines: domain_class,
        primary_format, one_sentence_summary, has_named_people and
        has_technical_terminology. Output for well-formed metadata is
        unchanged from the previous implementation.
    """
    dc = meta.get("domain_class")
    if dc is None:
        dc = "unknown"
    pf = meta.get("primary_format")
    if pf is None:
        pf = "unknown"
    summary = meta.get("one_sentence_summary")
    if summary is None:
        summary = ""
    cs = meta.get("content_signals")
    if not isinstance(cs, dict):
        # Model output is untrusted: null or a scalar here would break .get().
        cs = {}
    lines = [
        f"domain_class: {dc}",
        f"primary_format: {pf}",
        f"one_sentence_summary: {summary}",
        f"has_named_people: {cs.get('has_named_people', False)}",
        f"has_technical_terminology: {cs.get('has_technical_terminology', False)}",
    ]
    return "\n".join(lines)
def ingest(source_name, doc_text, orientation, group_id):
    """Send one document to Graphiti as a single-episode bulk request.

    Args:
        source_name: Used as the episode name.
        doc_text: Episode content; only the first 12000 characters are sent.
        orientation: Text sent as the episode's ``source_description``.
        group_id: Graphiti group the episode is filed under.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    episode = {
        "name": source_name,
        "content": doc_text[:12000],
        "source_description": orientation,
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": group_id}
    endpoint = f"{GRAPHITI_URL}/episodes/bulk"
    response = requests.post(endpoint, json=body, timeout=300)
    response.raise_for_status()
def save(results):
    """Write *results* to RESULTS_PATH as indented JSON, atomically.

    The JSON is written to a sibling ``.tmp`` file and then moved over the
    target with ``os.replace``. An interruption mid-write therefore cannot
    leave a truncated results file, which would break the resume path.

    Args:
        results: JSON-serialisable results structure.
    """
    RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
    tmp_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
    os.replace(tmp_path, RESULTS_PATH)
def _load_results():
    """Return previously saved results, or an empty structure on a fresh run.

    Both sub-sample lists are guaranteed to exist, so callers can append to
    them even if a saved file lacks one of the keys.
    """
    results = json.loads(RESULTS_PATH.read_text()) if RESULTS_PATH.exists() else {}
    results.setdefault("subsample_a", [])
    results.setdefault("subsample_b", [])
    return results


def _fetch_metadata(prompt_prefix, doc_text, label):
    """Run the Mistral metadata pass; return ``(metadata, ok)``.

    ``ok`` is False when the request itself failed. The failure is logged and
    an error dict is returned in place of metadata, so one unreachable model
    call does not abort the whole run.
    """
    try:
        return run_mistral(prompt_prefix, doc_text, label), True
    except Exception as e:  # per-item boundary: log and let the caller decide
        print(f" {label} metadata failed: {e}")
        return {"error": "request_failed", "raw": str(e)[:200]}, False


def _attempt_ingest(label, action):
    """Call *action* (a zero-argument ingest call), pause 3 s, report success.

    Any exception raised by *action* is logged as ``"<label> failed"`` and
    turned into a False return value instead of propagating.
    """
    try:
        action()
        time.sleep(3)
    except Exception as e:  # per-item boundary: log and move on
        print(f" {label} failed: {e}")
        return False
    return True


def _run_subsample_a(results, done, e14_by_name):
    """Ingest Sub-sample A under the taxonomy-free condition only.

    A source is recorded (and the results file saved) only after a successful
    ingest; anything that fails is left unrecorded so a later run retries it.
    """
    print("\nSub-sample A — taxonomy-free ingestion only")
    for item in SUBSAMPLE_A:
        name = item["name"]
        if name in done:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name}")
        doc_text = get_document_text(name)
        if not doc_text:
            print(" SKIP — no text")
            continue
        tf_meta, ok = _fetch_metadata(TAXFREE_PROMPT, doc_text, "taxfree")
        if not ok:
            continue
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        orientation = build_taxfree_orientation(tf_meta)
        if not _attempt_ingest("ingest", lambda: ingest(name, doc_text, orientation, GROUP_TAXFREE)):
            continue
        print(f" ingested to {GROUP_TAXFREE}")
        e14 = e14_by_name.get(name, {})
        results["subsample_a"].append({
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": tf_meta,
            "taxfree_orientation": orientation,
            "e14_prod_preds": e14.get("prod_preds"),
            "e14_cascade_preds": e14.get("cascade_preds"),
            "e14_delta_preds": e14.get("delta_preds"),
            "e14_prod_edges": e14.get("prod_edges"),
            "e14_cascade_edges": e14.get("cascade_edges"),
            "e14_delta_edges": e14.get("delta_edges"),
        })
        save(results)


def _run_subsample_b(results, done):
    """Ingest Sub-sample B under the baseline, standard and taxonomy-free conditions.

    Every source with text is recorded once all three conditions have been
    attempted. ``ingest_status`` in the entry records which conditions were
    actually ingested, so failures are visible in the results file.
    """
    print("\nSub-sample B — all three conditions")
    for item in SUBSAMPLE_B:
        name = item["name"]
        if name in done:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name} ({item['bucket']})")
        doc_text = get_document_text(name)
        if not doc_text:
            print(" SKIP — no text")
            continue
        status = {"baseline": False, "standard": False, "taxfree": False}
        entry = {
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": None,
            "standard_metadata": None,
            "ingest_status": status,
        }

        # Baseline: no orientation text.
        if _attempt_ingest("baseline", lambda: ingest(name, doc_text, "", GROUP_BASELINE)):
            status["baseline"] = True
            print(" baseline ingested")

        # Standard: orientation from the fixed-schema metadata.
        std_meta, ok = _fetch_metadata(STANDARD_PROMPT, doc_text, "standard")
        entry["standard_metadata"] = std_meta
        if ok and _attempt_ingest(
            "standard",
            lambda: ingest(name, doc_text, build_standard_orientation(std_meta), GROUP_STANDARD),
        ):
            status["standard"] = True
            print(f" standard ingested, domain_class={std_meta.get('domain_class','?')}")

        # Taxonomy-free: orientation from the frame-based metadata.
        tf_meta, ok = _fetch_metadata(TAXFREE_PROMPT, doc_text, "taxfree")
        entry["taxfree_metadata"] = tf_meta
        if ok:
            print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
            if _attempt_ingest(
                "taxfree",
                lambda: ingest(name, doc_text, build_taxfree_orientation(tf_meta), GROUP_TAXFREE),
            ):
                status["taxfree"] = True
                print(" taxfree ingested")

        results["subsample_b"].append(entry)
        save(results)


def run():
    """Run the E1.8 ingest phase for Sub-samples A and B, resuming if possible.

    Sources already present in the results file are skipped. The results file
    is rewritten after each recorded source, so an interrupted run can be
    restarted. Requires the E1.4 per-source comparison file, whose figures are
    copied into the Sub-sample A entries.

    Raises:
        FileNotFoundError: If the E1.4 comparison file is missing.
    """
    print("E1.8 — Ingest phase")
    print("=" * 60)
    # Load existing results if resuming
    resuming = RESULTS_PATH.exists()
    results = _load_results()
    done_a = {r["name"] for r in results["subsample_a"]}
    done_b = {r["name"] for r in results["subsample_b"]}
    if resuming:
        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")
    e14_path = Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json"
    e14_data = json.loads(e14_path.read_text())
    e14_by_name = {s["name"]: s for s in e14_data}
    # Sub-sample A — taxonomy-free only (baseline + standard from E1.4)
    _run_subsample_a(results, done_a, e14_by_name)
    # Sub-sample B — all three conditions
    _run_subsample_b(results, done_b)
    print("\n" + "=" * 60)
    print(f"Ingest complete. Results at {RESULTS_PATH}")
    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")


if __name__ == "__main__":
    run()