add experiment scripts and results; watcher.py latest changes
This commit is contained in:
@@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E1.8 Phase 1 — Ingest
|
||||
Runs taxonomy-free ingestion for Sub-sample A, and baseline, standard-cascade, and taxonomy-free ingestion for Sub-sample B.
|
||||
Run this first, then run e1_8_eval.py to pull predicate counts.
|
||||
"""
|
||||
|
||||
import os, json, time, psycopg2, requests
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
||||
|
||||
# Postgres connection string, read from the environment (after the .env file above is loaded).
PG_DSN = os.getenv("PG_DSN")
# Base URL of the service that ingest() posts episodes to.
GRAPHITI_URL = "http://localhost:8001"
# JSON file that save() writes and that run() re-reads when resuming.
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"

# group_id values passed to ingest(), one per experimental condition.
GROUP_TAXFREE = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"
|
||||
|
||||
TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.
|
||||
|
||||
Do not summarize content. Do not extract entities. Do not assign a single category label.
|
||||
|
||||
Instead, describe:
|
||||
- What domains or frames are active in this content (there may be several simultaneously)
|
||||
- How those frames relate to each other in this specific document
|
||||
- What kind of relational content a knowledge graph extractor should look for
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"active_frames": ["<frame 1>", "<frame 2>", ...],
|
||||
"frame_relationships": "<one sentence describing how the frames interact in this document>",
|
||||
"extraction_orientation": "<one sentence orienting the extractor toward the most relationship-rich content>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
# Sub-sample A: sources that run() ingests under the taxonomy-free condition only.
# "name" is the value matched against embeddings.source; "bucket" is copied
# into the results file unchanged.
SUBSAMPLE_A = [
    {"name": "Claude: Lubbock on everything album lyrics", "bucket": "high"},
    {"name": "ChatGPT: Tulsa Concept Album Guide", "bucket": "high"},
    {"name": "ChatGPT: Rhino 3D object flow", "bucket": "high"},
    {"name": "Claude: SUNY faculty conflict of interest policies", "bucket": "mid"},
    {"name": "Claude: Interview presentation research and preparation", "bucket": "mid"},
    {"name": "Claude: Research Statement Restructure", "bucket": "mid"},
    {"name": "ChatGPT: Respect Individual Interests for Christmas", "bucket": "low"},
    {"name": "University of North Texas Cover letter.pdf", "bucket": "document"},
    {"name": "Claude: Finding ideal rural housing near University of Utah", "bucket": "high"},
    {"name": "ChatGPT: SEC coaches with OSU ties", "bucket": "high"},
    {"name": "Claude: Bonding ASA 3D printed parts", "bucket": "mid"},
    {"name": "ChatGPT: Title: User request summary.", "bucket": "low"},
    {"name": "ChatGPT: Scholarship Recommendation Letter Tips", "bucket": "low"},
]
|
||||
|
||||
# Sub-sample B: sources that run() ingests under all three conditions
# (baseline, standard, taxonomy-free). Same entry shape as SUBSAMPLE_A.
SUBSAMPLE_B = [
    {"name": "ChatGPT: Job application comparison", "bucket": "high"},
    {"name": "ChatGPT: External review for tenure", "bucket": "high"},
    {"name": "Claude: University of Utah interview teaching example", "bucket": "high"},
    {"name": "ChatGPT: Starting Dropship Gun Business", "bucket": "high"},
    {"name": "ChatGPT: Analyze business plan", "bucket": "high"},
    {"name": "ChatGPT: Outdoor Layering Explained", "bucket": "mid"},
    {"name": "ChatGPT: Limits in Calculus.", "bucket": "mid"},
    {"name": "ChatGPT: Academic Program Director Role", "bucket": "mid"},
    {"name": "ChatGPT: Lonely Island Poop Skit", "bucket": "mid"},
    {"name": "ChatGPT: Parse Tidal playlist", "bucket": "mid"},
    {"name": "NO thesis proposal.pdf", "bucket": "document"},
    {"name": "PWM.pdf", "bucket": "document"},
    {"name": "Will_It_Print.pdf", "bucket": "document"},
    {"name": "Kim Kedem Ind Study F2025 Syllabus.docx", "bucket": "document"},
    {"name": "Aaron Nelson Graduate Transcript.pdf", "bucket": "document"},
]
|
||||
|
||||
|
||||
def get_pg():
    """Open a new psycopg2 connection using the module-level PG_DSN.

    The caller owns the returned connection and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def get_document_text(source_name):
    """Return the stored text for one source, capped at 12,000 characters.

    Reads the ``document`` column of up to 20 ``embeddings`` rows whose
    ``source`` equals ``source_name`` (ordered by ``id``) and joins them with
    single spaces.

    Args:
        source_name: Value to match against ``embeddings.source``.

    Returns:
        The joined text truncated to 12,000 characters; an empty string when
        no rows match.
    """
    pg = get_pg()
    try:
        # The cursor context manager closes the cursor on exit.
        with pg.cursor() as cur:
            cur.execute(
                "SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20",
                (source_name,),
            )
            rows = cur.fetchall()
    finally:
        # Always release the connection, even when the query raises.
        pg.close()
    # Skip NULL documents: str.join would raise TypeError on None.
    return " ".join(r[0] for r in rows if r[0] is not None)[:12000]
|
||||
|
||||
|
||||
def run_mistral(prompt_prefix, doc_text, label=""):
    """Send ``prompt_prefix + doc_text`` to the local model server and return the parsed reply.

    Posts a non-streaming, JSON-format generate request for ``mistral:latest``
    to ``http://localhost:11434/api/generate`` with a 300-second timeout.

    Args:
        prompt_prefix: Instruction prompt placed before the document text.
        doc_text: Document text appended to the prompt.
        label: Short tag used only in the progress messages.

    Returns:
        The decoded JSON object as a dict. When the model's reply is not valid
        JSON, or decodes to something other than an object, returns
        ``{"error": "parse_failed", "raw": <first 200 chars of the reply>}``.

    Raises:
        Whatever ``requests.post`` / ``raise_for_status`` / ``resp.json`` raise
        for connection errors, timeouts, HTTP error statuses, or a non-JSON
        HTTP body; these are not caught here.
    """
    print(f" → Mistral {label} running...", flush=True)
    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    resp.raise_for_status()
    raw = resp.json().get("response", "{}")
    print(f" → Mistral {label} done ({len(raw)} chars)", flush=True)
    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers a non-string reply.
        return {"error": "parse_failed", "raw": raw[:200]}
    if not isinstance(parsed, dict):
        # Callers use .get() on the result, so a list/scalar reply is treated as a parse failure.
        return {"error": "parse_failed", "raw": raw[:200]}
    return parsed
|
||||
|
||||
|
||||
def build_taxfree_orientation(meta):
    """Format taxonomy-free metadata into a one-line orientation string.

    Args:
        meta: Dict of model output; the keys read are ``active_frames``,
            ``frame_relationships``, ``extraction_orientation`` and
            ``one_sentence_summary``. Missing keys render as empty text.

    Returns:
        A string of the form
        ``"Active frames: <a, b>. Frame relationships: <..> Extraction focus: <..> Summary: <..>"``.
    """
    raw_frames = meta.get("active_frames") or []
    # Model output is not guaranteed to follow the schema: a bare string or other
    # scalar is treated as a single frame instead of being split or crashing join().
    if not isinstance(raw_frames, (list, tuple)):
        raw_frames = [raw_frames]
    frames = ", ".join(str(frame) for frame in raw_frames)
    rel = meta.get("frame_relationships", "")
    orient = meta.get("extraction_orientation", "")
    summary = meta.get("one_sentence_summary", "")
    return f"Active frames: {frames}. Frame relationships: {rel} Extraction focus: {orient} Summary: {summary}"
|
||||
|
||||
|
||||
def build_standard_orientation(meta):
    """Format standard-schema metadata into a multi-line orientation string.

    Args:
        meta: Dict of model output; the keys read are ``domain_class``,
            ``primary_format``, ``one_sentence_summary`` and, inside
            ``content_signals``, ``has_named_people`` and
            ``has_technical_terminology``.

    Returns:
        Five ``key: value`` lines joined by newlines. Missing
        ``domain_class`` / ``primary_format`` render as ``unknown``, a missing
        summary as empty text, and missing signals as ``False``.
    """
    dc = meta.get("domain_class", "unknown")
    pf = meta.get("primary_format", "unknown")
    summary = meta.get("one_sentence_summary", "")
    cs = meta.get("content_signals")
    # Model output may carry null or a non-object here; fall back to "no signals"
    # rather than raising AttributeError on .get().
    if not isinstance(cs, dict):
        cs = {}
    return (f"domain_class: {dc}\nprimary_format: {pf}\none_sentence_summary: {summary}\n"
            f"has_named_people: {cs.get('has_named_people', False)}\n"
            f"has_technical_terminology: {cs.get('has_technical_terminology', False)}")
|
||||
|
||||
|
||||
def ingest(source_name, doc_text, orientation, group_id):
    """POST one document to the bulk-episodes endpoint as a single episode.

    Args:
        source_name: Used as the episode name.
        doc_text: Episode content; only the first 12,000 characters are sent.
        orientation: Text sent as the episode's ``source_description``.
        group_id: Group the episode is filed under.

    Raises:
        Whatever ``requests.post`` / ``raise_for_status`` raise on connection
        failure, timeout (300 s) or an HTTP error status.
    """
    episode = {
        "name": source_name,
        "content": doc_text[:12000],
        "source_description": orientation,
        # Every episode is sent with the same fixed timestamp.
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": group_id}
    response = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=body, timeout=300)
    response.raise_for_status()
|
||||
|
||||
|
||||
def save(results):
    """Write ``results`` to RESULTS_PATH as indented JSON.

    The JSON is written to a sibling ``.tmp`` file first and then moved over
    RESULTS_PATH, so an interruption mid-write cannot leave a truncated
    results file for the next (resuming) run to choke on.

    Args:
        results: JSON-serialisable results structure.
    """
    tmp_path = RESULTS_PATH.with_name(RESULTS_PATH.name + ".tmp")
    tmp_path.write_text(json.dumps(results, indent=2))
    # Path.replace overwrites the destination in one rename step.
    tmp_path.replace(RESULTS_PATH)
|
||||
|
||||
|
||||
def run():
    """Run the E1.8 ingest phase for Sub-samples A and B.

    Sub-sample A sources get a taxonomy-free orientation and are ingested into
    GROUP_TAXFREE; each recorded entry also carries the matching E1.4
    predicate/edge counts read from ``e14_per_source_comparison.json``.
    Sub-sample B sources are ingested three times: with an empty orientation
    (GROUP_BASELINE), with the standard-schema orientation (GROUP_STANDARD),
    and with the taxonomy-free orientation (GROUP_TAXFREE).

    Results are saved to RESULTS_PATH after every recorded source, and sources
    already present in that file are skipped, so an interrupted run can be
    resumed by running the script again.
    """
    print("E1.8 — Ingest phase")
    print("=" * 60)

    # Load existing results if resuming
    if RESULTS_PATH.exists():
        results = json.loads(RESULTS_PATH.read_text())
        # Guarantee both lists exist so the appends below cannot raise KeyError
        # on a results file that lacks one of them.
        results.setdefault("subsample_a", [])
        results.setdefault("subsample_b", [])
        done_a = {r["name"] for r in results["subsample_a"]}
        done_b = {r["name"] for r in results["subsample_b"]}
        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")
    else:
        results = {"subsample_a": [], "subsample_b": []}
        done_a, done_b = set(), set()

    e14_path = Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json"
    e14_data = json.loads(e14_path.read_text())
    e14_by_name = {s["name"]: s for s in e14_data}

    # Sub-sample A — taxonomy-free only (baseline + standard from E1.4)
    print("\nSub-sample A — taxonomy-free ingestion only")
    for item in SUBSAMPLE_A:
        name = item["name"]
        if name in done_a:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name}")
        doc_text = get_document_text(name)
        if not doc_text:
            print(" SKIP — no text")
            continue

        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        orientation = build_taxfree_orientation(tf_meta)

        try:
            ingest(name, doc_text, orientation, GROUP_TAXFREE)
            time.sleep(3)
            print(f" ingested to {GROUP_TAXFREE}")
        except Exception as e:
            # A failed source is not recorded, so a resumed run retries it.
            print(f" ingest failed: {e}")
            continue

        e14 = e14_by_name.get(name, {})
        results["subsample_a"].append({
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": tf_meta,
            "taxfree_orientation": orientation,
            "e14_prod_preds": e14.get("prod_preds"),
            "e14_cascade_preds": e14.get("cascade_preds"),
            "e14_delta_preds": e14.get("delta_preds"),
            "e14_prod_edges": e14.get("prod_edges"),
            "e14_cascade_edges": e14.get("cascade_edges"),
            "e14_delta_edges": e14.get("delta_edges"),
        })
        save(results)

    # Sub-sample B — all three conditions
    # NOTE(review): a failed ingest below is only printed; the entry is still
    # recorded, so a resumed run skips that source — confirm this is intended.
    print("\nSub-sample B — all three conditions")
    for item in SUBSAMPLE_B:
        name = item["name"]
        if name in done_b:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name} ({item['bucket']})")
        doc_text = get_document_text(name)
        if not doc_text:
            print(" SKIP — no text")
            continue

        entry = {"name": name, "bucket": item["bucket"],
                 "taxfree_metadata": None, "standard_metadata": None}

        # Baseline
        try:
            ingest(name, doc_text, "", GROUP_BASELINE)
            time.sleep(3)
            print(" baseline ingested")
        except Exception as e:
            print(f" baseline failed: {e}")

        # Standard
        std_meta = run_mistral(STANDARD_PROMPT, doc_text, "standard")
        entry["standard_metadata"] = std_meta
        try:
            ingest(name, doc_text, build_standard_orientation(std_meta), GROUP_STANDARD)
            time.sleep(3)
            print(f" standard ingested, domain_class={std_meta.get('domain_class','?')}")
        except Exception as e:
            print(f" standard failed: {e}")

        # Taxonomy-free
        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
        entry["taxfree_metadata"] = tf_meta
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        try:
            ingest(name, doc_text, build_taxfree_orientation(tf_meta), GROUP_TAXFREE)
            time.sleep(3)
            print(" taxfree ingested")
        except Exception as e:
            print(f" taxfree failed: {e}")

        results["subsample_b"].append(entry)
        save(results)

    print("\n" + "=" * 60)
    print(f"Ingest complete. Results at {RESULTS_PATH}")
    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")
|
||||
|
||||
|
||||
# Script entry point: run the ingest phase only when executed directly.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user