add experiment scripts and results; watcher.py latest changes
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 orchestration — fetch source text, run Mistral metadata, submit to Graphiti test group_id."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_test"
|
||||
MAX_DOC_CHARS = 12000 # Same cap as Tier 1 for parity
|
||||
|
||||
# Stage 2 metadata prompt — verbatim from stage-2-worker-spec.md
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
|
||||
return psycopg2.connect(PG_DSN)
|
||||
|
||||
|
||||
def fetch_source_text(source):
|
||||
"""Reassemble the full document from pgvector chunks, mirroring tier1_migration.py logic."""
|
||||
conn = get_pg()
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
|
||||
FROM embeddings WHERE source = %s
|
||||
""", (source,))
|
||||
row = cur.fetchone()
|
||||
conn.close()
|
||||
if row is None or row[0] is None:
|
||||
return None
|
||||
return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text):
|
||||
"""Call local Mistral via Ollama for base-class metadata."""
|
||||
truncated = text[:MAX_DOC_CHARS]
|
||||
prompt = METADATA_PROMPT + truncated
|
||||
response = requests.post(
|
||||
"http://localhost:11434/api/generate",
|
||||
json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
|
||||
timeout=180,
|
||||
)
|
||||
response.raise_for_status()
|
||||
raw = response.json()["response"]
|
||||
try:
|
||||
metadata = json.loads(raw)
|
||||
# Override char_length with python-computed value (per stage-2-worker-spec)
|
||||
metadata["char_length"] = len(truncated)
|
||||
return metadata
|
||||
except json.JSONDecodeError:
|
||||
return {"error": "JSON parse failed", "raw": raw[:500]}
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
|
||||
"""Format the base-class metadata as a source_description for Graphiti, with orient-not-bound framing."""
|
||||
if "error" in metadata:
|
||||
return f"tier1_cascade_test (metadata generation failed: {metadata['error']})"
|
||||
summary = metadata.get("one_sentence_summary", "")
|
||||
domain = metadata.get("domain_class", "unknown")
|
||||
fmt = metadata.get("primary_format", "unknown")
|
||||
return (
|
||||
f"This is a {domain} document in {fmt} format. "
|
||||
f"Summary: {summary} "
|
||||
f"This metadata is provided to orient your extraction, not to constrain it. "
|
||||
f"Extract entities and relationships freely from the document text itself; "
|
||||
f"the metadata is descriptive context, not a checklist."
|
||||
)
|
||||
|
||||
|
||||
def submit_episode(name, content, source_description):
|
||||
"""Submit episode to Graphiti sidecar at the test group_id."""
|
||||
payload = {
|
||||
"episodes": [{
|
||||
"name": name,
|
||||
"content": content[:MAX_DOC_CHARS],
|
||||
"source_description": source_description,
|
||||
"timestamp": "2026-04-28T00:00:00",
|
||||
}],
|
||||
"group_id": TEST_GROUP_ID,
|
||||
}
|
||||
response = requests.post(f"{SIDECAR_URL}/episodes/bulk", json=payload, timeout=300)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
|
||||
def main():
|
||||
with open(SAMPLE_FILE) as f:
|
||||
sample = json.load(f)
|
||||
selected = sample["selected"]
|
||||
print(f"E1 cascade re-extraction starting — {len(selected)} episodes to test group_id={TEST_GROUP_ID}\n")
|
||||
|
||||
results = []
|
||||
for i, ep in enumerate(selected, 1):
|
||||
name = ep["name"]
|
||||
bucket = ep["bucket"]
|
||||
print(f"[{i}/{len(selected)}] [{bucket}] {name}")
|
||||
record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}
|
||||
|
||||
# Fetch text
|
||||
print(f" Fetching source text...", end=" ", flush=True)
|
||||
text = fetch_source_text(name)
|
||||
if text is None:
|
||||
print("FAILED — no chunks in pgvector")
|
||||
record["error"] = "no source text"
|
||||
results.append(record)
|
||||
continue
|
||||
record["doc_chars"] = len(text)
|
||||
print(f"{len(text)} chars")
|
||||
|
||||
# Mistral metadata
|
||||
print(f" Generating Mistral metadata...", end=" ", flush=True)
|
||||
t0 = time.time()
|
||||
metadata = run_mistral_metadata(text)
|
||||
elapsed = time.time() - t0
|
||||
record["metadata"] = metadata
|
||||
record["metadata_elapsed_s"] = round(elapsed, 1)
|
||||
if "error" in metadata:
|
||||
print(f"FAILED in {elapsed:.1f}s")
|
||||
else:
|
||||
print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")
|
||||
|
||||
# Submit to Graphiti
|
||||
source_desc = format_metadata_as_orientation(metadata)
|
||||
record["source_description"] = source_desc
|
||||
print(f" Submitting to Graphiti test group...", end=" ", flush=True)
|
||||
t0 = time.time()
|
||||
try:
|
||||
result = submit_episode(name, text, source_desc)
|
||||
elapsed = time.time() - t0
|
||||
print(f"{elapsed:.1f}s — OK")
|
||||
record["submit_elapsed_s"] = round(elapsed, 1)
|
||||
record["submit_result"] = result
|
||||
except Exception as e:
|
||||
elapsed = time.time() - t0
|
||||
print(f"{elapsed:.1f}s — FAILED: {e}")
|
||||
record["submit_error"] = str(e)
|
||||
|
||||
results.append(record)
|
||||
# Save intermediate state after each episode
|
||||
with open(RESULTS_FILE, "w") as f:
|
||||
json.dump({"results": results}, f, indent=2, default=str)
|
||||
print()
|
||||
|
||||
print(f"\nDone. Results saved to {RESULTS_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user