add experiment scripts and results; watcher.py latest changes

This commit is contained in:
2026-04-30 18:06:03 +00:00
parent 1cf26df450
commit f11cacd9c9
55 changed files with 23594 additions and 726 deletions
+193
View File
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Audit Expansion Pack Generator — type-aware stratified draw of 12
documents from base_class_validation_results.json for n=20 audit expansion.
Per audit-expansion-protocol.md amendment 2026-04-28:
The seed=43 length-only random draw concentrated on course modules in the
small and medium buckets, missing voice captures, syllabi, and
conversational documents present in the candidate distribution.
This script implements type-aware stratification within each length
bucket to produce a sample representative of BirdAI's document-type mix.
Targets (12 total):
small (4): 2 course_module + 2 voice_capture
medium (4): 2 course_module + 1 syllabus + 1 other
large (4): 1 course_ppt + 1 syllabus + 1 faculty_report + 1 conversational
Output: ~/aaronai/experiments/audit_expansion_pack.json
Usage:
python3 ~/aaronai/scripts/audit_expansion_draw.py
python3 ~/aaronai/scripts/audit_expansion_draw.py --dry-run
"""
import argparse
import json
import random
import re
import sys
import time
from pathlib import Path
# Directory that holds the experiment input and output JSON files.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Validation run whose "results" list is the candidate pool for the draw.
VALIDATION_RESULTS = EXPERIMENTS / "base_class_validation_results.json"
# Existing audit pack; sources found in it are excluded from the new draw.
EXISTING_AUDIT_PACK = EXPERIMENTS / "base_class_audit_pack.json"
# Destination of the drawn expansion pack (not written with --dry-run).
OUTPUT_FILE = EXPERIMENTS / "audit_expansion_pack.json"
# Seed passed to random.seed() before sampling, so the draw is repeatable.
SEED = 43
# Type-aware targets per bucket: bucket name -> {document type: number to draw}.
# Types are the labels returned by classify(); the targets sum to 12.
TYPE_TARGETS = {
"small": {"course_module": 2, "voice_capture": 2},
"medium": {"course_module": 2, "syllabus": 1, "other": 1},
"large": {"course_ppt": 1, "syllabus": 1, "faculty_report": 1, "conversational": 1},
}
def classify(source, bucket):
    """Map a source filename to a document type for the stratified draw.

    Rules are checked in order and the first match wins:

    1. ``YYYY-MM-DD-HH-MM-voice.md`` (case-sensitive)  -> "voice_capture"
    2. starts with "Claude:" or "ChatGPT:"             -> "conversational"
    3. contains "syllabus" (any case)                  -> "syllabus"
    4. contains "faculty report" / "annual report"     -> "faculty_report"
    5. bucket is "large" and the name contains ".pptx" or "_ppt_", or
       starts with "mod<digits>_" (any case)           -> "course_ppt"
    6. starts with two digits and an underscore        -> "course_module"
    7. anything else                                   -> "other"

    Args:
        source: Source filename or title. A missing (None), empty, or
            non-string value is classified as "other" instead of raising,
            because callers pass ``doc.get("source")`` straight through.
        bucket: Size bucket name; only "large" enables the course_ppt rule.

    Returns:
        One of "voice_capture", "conversational", "syllabus",
        "faculty_report", "course_ppt", "course_module", "other".
    """
    if not isinstance(source, str) or not source:
        return "other"
    s = source.lower()
    # Voice captures — pattern: YYYY-MM-DD-HH-MM-voice.md
    if re.match(r"\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-voice\.md$", source):
        return "voice_capture"
    # Conversational exports — pattern: "Claude: ..." or "ChatGPT: ..."
    if source.startswith(("Claude:", "ChatGPT:")):
        return "conversational"
    # Syllabus — must contain "syllabus" in the name
    if "syllabus" in s:
        return "syllabus"
    # Faculty / annual reports
    if "faculty report" in s or "annual report" in s:
        return "faculty_report"
    # Course PPTs (large bucket only) — any ".pptx", "_ppt_", or "ModNN_" prefix
    if bucket == "large" and (".pptx" in s or "_ppt_" in s or re.match(r"mod\d+_", s)):
        return "course_ppt"
    # Course modules — two-digit numeric prefix such as "03_intro.docx".
    # This rule is not bucket-scoped, so it can also fire in the large bucket.
    if re.match(r"^\d{2}_", source):
        return "course_module"
    # Everything else falls into 'other' (only the medium bucket targets it)
    return "other"
def main():
    """Draw the type-aware stratified audit expansion sample and write it out.

    Steps:
      1. Load the candidate documents from VALIDATION_RESULTS ("results" list).
      2. Collect sources already present in EXISTING_AUDIT_PACK (if the file
         exists) so they are excluded. The pack may be a dict with a "pairs"
         or "results" list, or a bare list of entries.
      3. Keep documents that have both condition_a and condition_b and whose
         size_bucket is a key of TYPE_TARGETS; tag each with "_type" using
         classify(). The "_type" key is written to the output with the doc.
      4. After random.seed(SEED), sample each (bucket, type) target. When
         fewer candidates exist than requested, a warning is recorded and
         all available candidates of that type are drawn.
      5. Unless --dry-run was given, write metadata and the drawn documents
         to OUTPUT_FILE.

    Exits with status 1 when VALIDATION_RESULTS does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not VALIDATION_RESULTS.exists():
        print(f"ERROR: {VALIDATION_RESULTS} not found", file=sys.stderr)
        sys.exit(1)
    with open(VALIDATION_RESULTS) as f:
        validation = json.load(f)
    all_docs = validation["results"]
    print(f"Loaded {len(all_docs)} documents from validation results")
    print(f"Experiment: {validation.get('title', 'unknown')}")

    # Load existing audit pack to exclude its sources (audit pack uses 'pairs')
    excluded_sources = set()
    if EXISTING_AUDIT_PACK.exists():
        with open(EXISTING_AUDIT_PACK) as f:
            existing = json.load(f)
        # Only a dict has .get(); a bare list is already the entry list.
        if isinstance(existing, dict):
            existing_pairs = existing.get("pairs", existing.get("results", []))
        else:
            existing_pairs = existing
        for doc in existing_pairs:
            src = doc.get("source") if isinstance(doc, dict) else None
            if src:
                excluded_sources.add(src)
        print(f"Excluding {len(excluded_sources)} sources already in audit pack")

    # Filter to valid candidates
    valid_docs = []
    for doc in all_docs:
        src = doc.get("source")
        if src in excluded_sources:
            continue
        if not doc.get("condition_a") or not doc.get("condition_b"):
            continue
        bucket = doc.get("size_bucket")
        if bucket not in TYPE_TARGETS:
            continue
        doc["_type"] = classify(src, bucket)
        valid_docs.append(doc)
    print(f"Valid candidate documents: {len(valid_docs)}")

    # Group candidates once by bucket then type. File order is preserved
    # inside each group so the seeded draw below stays reproducible.
    candidates = {bucket: {} for bucket in TYPE_TARGETS}
    for doc in valid_docs:
        candidates[doc["size_bucket"]].setdefault(doc["_type"], []).append(doc)

    # Print what's available per (bucket, type) before drawing
    print("\nCandidates by (bucket, type):")
    for bucket, by_type in candidates.items():
        print(f" {bucket}:")
        for t in sorted(by_type):
            target = TYPE_TARGETS[bucket].get(t, "")
            print(f" {t:>16}: {len(by_type[t])} avail, target {target}")

    # Stratified type-aware draw
    random.seed(SEED)
    drawn = []
    warnings = []
    for bucket, type_targets in TYPE_TARGETS.items():
        for doc_type, target in type_targets.items():
            type_docs = candidates[bucket].get(doc_type, [])
            if len(type_docs) < target:
                msg = (f"WARNING: bucket={bucket} type={doc_type} "
                       f"available={len(type_docs)} target={target}")
                warnings.append(msg)
                print(msg, file=sys.stderr)
            n_to_draw = min(target, len(type_docs))
            drawn.extend(random.sample(type_docs, n_to_draw))

    # Report draw
    print(f"\nDrew {len(drawn)} documents:")
    for d in drawn:
        src = d.get("source", "<unknown>")
        chars = d.get("doc_chars_original", 0)
        bucket = d.get("size_bucket", "?")
        doc_type = d.get("_type", "?")
        truncated = " (TRUNCATED)" if d.get("truncated") else ""
        print(f" [{bucket:>6}/{doc_type:>16}] {chars:>6}c {src}{truncated}")

    # Bucket-level summary
    bucket_counts = {bucket: 0 for bucket in TYPE_TARGETS}
    for d in drawn:
        bucket_counts[d["size_bucket"]] += 1
    print(f"\nBucket totals: {bucket_counts}")

    if args.dry_run:
        print("\n--dry-run set, not writing output file")
        return

    output = {
        "metadata": {
            "generated_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "source_validation_file": str(VALIDATION_RESULTS),
            "seed": SEED,
            "stratification": "type-aware within length bucket",
            "type_targets": TYPE_TARGETS,
            "bucket_counts": bucket_counts,
            "excluded_count": len(excluded_sources),
            "warnings": warnings,
            "purpose": "n=20 audit expansion per audit-expansion-protocol.md (type-aware amendment)",
        },
        "results": drawn,
    }
    with open(OUTPUT_FILE, "w") as f:
        json.dump(output, f, indent=2, default=str)
    print(f"\nWrote {OUTPUT_FILE}")
    print(f" {len(drawn)} documents ready for rating")


if __name__ == "__main__":
    main()
+605
View File
@@ -0,0 +1,605 @@
#!/usr/bin/env python3
"""
Base-Class Enrichment Test — OOP Framing Experiment
Tests whether non-entity metadata from a local model (domain class, structural
signals, presence flags, length, summary) can take load off the API without
constraining what it extracts.
The local model does NOT draft entities. The API still does full extraction.
The local model produces metadata that orients the API's reading.
Conditions:
A — Baseline: single Claude Haiku call, full extraction, no metadata
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
If B produces fewer edges or less predicate diversity, metadata is acting as
constraint and the OOP framing is falsified.
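Sample (audit re-run): the sources listed under "pairs" in
~/aaronai/experiments/base_class_audit_pack.json (see stratify()), not a
fresh draw. The original sample design was 50 docs from
briefing_test_v2_results.json (15 small / 25 medium / 10 large).
Size buckets are assigned from the length of the text actually sent:
- small (<1000 chars)
- medium (1000-5000 chars)
- large (5000-12000 chars, capped at 12K)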
Outputs: ~/aaronai/experiments/base_class_audit_rerun_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# Load environment variables from ~/aaronai/.env
# (main() reads ANTHROPIC_API_KEY and PG_DSN from the environment).
load_dotenv(Path.home() / "aaronai" / ".env")
# Earlier experiment results; main() loads its "documents" list.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Where main() writes the summary and per-document results.
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_audit_rerun_results.json"
# Settings passed to client.messages.create() in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0
# Local Ollama generate endpoint, model name and request timeout
# used by call_local_metadata().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates the joined document to this many characters.
MAX_DOC_CHARS = 12000
# Rates used by main() for the cost estimate: token counts are multiplied
# by these and divided by 1,000,000, and reported under *_cost_usd keys.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) prompt. main() appends the document text directly
# after the trailing "DOCUMENT:" line.
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local model's metadata pass. call_local_metadata() appends
# the document text after "DOCUMENT:". main() overwrites the returned
# "char_length" with len(doc_text) when the parsed metadata is a dict.
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
Return ONLY valid JSON with this exact schema:
{
"language": "en or other",
"char_length": integer,
"primary_format": "prose, presentation, list, form, code, or mixed",
"structural_signals": {
"has_headings": boolean,
"has_bullet_lists": boolean,
"has_numbered_lists": boolean,
"has_tables": boolean,
"has_code_blocks": boolean,
"has_dates": boolean
},
"content_signals": {
"has_named_people": boolean,
"has_institutional_language": boolean,
"has_technical_terminology": boolean,
"has_first_person": boolean,
"has_quotations": boolean
},
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
}
JSON only, no commentary.
DOCUMENT:
"""
# Condition B prompt. main() fills "{metadata_json}" with str.replace(), not
# str.format(), because the JSON schema below contains literal braces; the
# document text is then appended after "DOCUMENT:".
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
DOCUMENT METADATA:
{metadata_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Remove a surrounding Markdown code fence from a model reply.

    Falsy input yields "". Otherwise the text is trimmed, a leading
    ``` or ```json marker (plus following whitespace) and a trailing ```
    marker (plus preceding whitespace) are removed, and the result is
    trimmed again. Text without fences is returned trimmed.
    """
    if not text:
        return ""
    body = text.strip()
    # Opening fence first, then closing fence; each is anchored to one end.
    for fence_pattern in (r"^```(?:json)?\s*", r"\s*```$"):
        body = re.sub(fence_pattern, "", body)
    return body.strip()
def fetch_document_text(pg_conn, source):
    """Fetch and join all stored chunks of one source document.

    Rows of ``embeddings`` whose ``source`` matches are read in ``id`` order
    and their ``document`` values joined with blank lines.

    Args:
        pg_conn: Open DB-API connection (psycopg2 in this file).
        source: Value matched against the ``source`` column.

    Returns:
        ``(text, full_length)``: text is the joined document cut to
        MAX_DOC_CHARS characters and full_length is its length before the
        cut. ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query raises, so it is not leaked.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and summarize the reply.

    Args:
        client: Object exposing ``messages.create`` (main() passes an
            ``anthropic.Anthropic`` instance).
        prompt_text: Complete prompt, sent as the only user message.

    Returns:
        dict with input_tokens, output_tokens, latency_s (wall-clock seconds
        rounded to 2 places), response_text (text of the first content
        block, or "" when the reply has no content) and stop_reason.
        Exceptions from the client are not caught here.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface metadata about a document.

    Posts LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL as a
    non-streaming, JSON-formatted generation request.

    Returns:
        ``{"response": <raw model text>, "latency_s": <seconds>}`` on
        success, or ``{"error": <message>, "latency_s": <seconds>}`` when
        anything in the request raises. This function never raises; callers
        detect failure by the "error" key.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        # Built inside the try so a bad document_text is also reported as an error.
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        http_resp = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        http_resp.raise_for_status()
        body = http_resp.json()
        return {"response": body.get("response", ""), "latency_s": elapsed()}
    except Exception as exc:
        return {"error": str(exc), "latency_s": elapsed()}
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply is fence-stripped and decoded as JSON. It counts as parsed
    only when the top level is an object whose "entities" and "edges"
    values are both lists; those lists are returned unchanged. Every other
    outcome (empty text, invalid JSON, wrong shape) yields
    ``(None, None, False)``.
    """
    failure = (None, None, False)
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return failure
    try:
        payload = json.loads(cleaned)
    except json.JSONDecodeError:
        return failure
    if not isinstance(payload, dict):
        return failure
    entity_list, edge_list = payload.get("entities"), payload.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
def parse_metadata(raw):
    """Decode the local model's metadata reply.

    Returns the decoded JSON value (whatever its type) after fence
    stripping, or None when the cleaned text is empty or not valid JSON.
    """
    cleaned = strip_json_fences(raw)
    try:
        parsed = json.loads(cleaned) if cleaned else None
    except json.JSONDecodeError:
        parsed = None
    return parsed
def graph_metrics(entities, edges):
    """Compute size, diversity and connectivity metrics for an extracted graph.

    Args:
        entities: List of entity records (dicts with "name"/"type"), or None.
        edges: List of edge records (dicts with "subject"/"predicate"/
            "object"), or None.

    Returns:
        None when either argument is None. Otherwise a dict with:
        n_entities / n_edges (raw list lengths), predicate_diversity and
        type_diversity (distinct values after str/strip/lower), avg_degree
        (2 * n_edges / n_entities, 2 decimals), largest_component (node
        count of the biggest connected component over distinct normalized
        entity names) and largest_component_pct (that count as a percentage
        of n_entities, 1 decimal). Non-dict records are ignored for the
        diversity and connectivity figures but still count in the lengths.
    """
    if entities is None or edges is None:
        return None

    def norm(value):
        return str(value).strip().lower()

    def distinct(records, key):
        # Normalized truthy values of `key` across dict-shaped records.
        return {
            norm(rec.get(key))
            for rec in records
            if isinstance(rec, dict) and rec.get(key)
        }

    entity_total = len(entities)
    edge_total = len(edges)
    predicate_set = distinct(edges, "predicate")
    type_set = distinct(entities, "type")
    node_names = distinct(entities, "name")

    # Each edge touches two nodes, hence the factor of two.
    mean_degree = (2 * edge_total / entity_total) if entity_total > 0 else 0.0

    # Undirected adjacency; edges that mention an unknown entity are dropped.
    neighbors = {name: set() for name in node_names}
    for edge in edges:
        if not isinstance(edge, dict):
            continue
        src = norm(edge.get("subject", ""))
        dst = norm(edge.get("object", ""))
        if src in neighbors and dst in neighbors:
            neighbors[src].add(dst)
            neighbors[dst].add(src)

    # Stack-based traversal; nodes are marked when pushed so each is counted once.
    seen = set()
    biggest = 0
    for root in neighbors:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for nxt in neighbors[current]:
                if nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)
        biggest = max(biggest, size)

    if entity_total:
        biggest_pct = round(100 * biggest / entity_total, 1)
    else:
        biggest_pct = 0.0
    return {
        "n_entities": entity_total,
        "n_edges": edge_total,
        "predicate_diversity": len(predicate_set),
        "type_diversity": len(type_set),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
def stratify(docs):
    """Return the audit re-run sample taken from base_class_audit_pack.json.

    The ``docs`` argument is accepted for interface compatibility but not
    used: the sample is one synthesized entry per item of the audit pack's
    "pairs" list, carrying that item's "source", a zero content_length and
    status "SUCCESS". When the audit pack file does not exist an error is
    printed and an empty list is returned.
    """
    pack_path = Path.home() / "aaronai" / "experiments" / "base_class_audit_pack.json"
    if not pack_path.exists():
        print(f"ERROR: {pack_path} not found")
        return []
    pack = json.loads(pack_path.read_text())
    # Synthesize doc_meta entries for the audit sources
    sample = []
    for pair in pack["pairs"]:
        sample.append({"source": pair["source"], "content_length": 0, "status": "SUCCESS"})
    print(f"Audit re-run: {len(sample)} docs from base_class_audit_pack.json")
    return sample
def fmt_metrics(m):
    """Render a graph_metrics() dict as a compact one-line summary.

    Returns "n/a" for None; otherwise space-separated ``key=value`` fields
    for entities, edges, predicate diversity, type diversity, average
    degree and largest component (shown as ``component/entities``).
    """
    if m is None:
        return "n/a"
    fields = (
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    )
    return " ".join(fields)
def _condition_a_record(a, a_metrics):
    """Build the stored ``condition_a`` entry from a call_haiku() result or error dict."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def _fmt_signed_pct(value):
    """Format a percentage delta with an explicit sign, or "n/a" when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def _avg_metric(rows, condition, key):
    """Mean of one metric across rows for a condition (2 decimals), or None if no values."""
    vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
    return round(statistics.mean(vals), 2) if vals else None


def _bucket_summary(rows):
    """Aggregate token deltas and average graph metrics for one size bucket's valid rows."""
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    return {
        "n": len(rows),
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
        "a_avg_entities": _avg_metric(rows, "condition_a", "n_entities"),
        "b_avg_entities": _avg_metric(rows, "condition_b", "n_entities"),
        "a_avg_edges": _avg_metric(rows, "condition_a", "n_edges"),
        "b_avg_edges": _avg_metric(rows, "condition_b", "n_edges"),
        "a_avg_predicate_diversity": _avg_metric(rows, "condition_a", "predicate_diversity"),
        "b_avg_predicate_diversity": _avg_metric(rows, "condition_b", "predicate_diversity"),
        "a_avg_type_diversity": _avg_metric(rows, "condition_a", "type_diversity"),
        "b_avg_type_diversity": _avg_metric(rows, "condition_b", "type_diversity"),
        "a_avg_degree": _avg_metric(rows, "condition_a", "avg_degree"),
        "b_avg_degree": _avg_metric(rows, "condition_b", "avg_degree"),
        "a_avg_largest_component_pct": _avg_metric(rows, "condition_a", "largest_component_pct"),
        "b_avg_largest_component_pct": _avg_metric(rows, "condition_b", "largest_component_pct"),
    }


def _process_document(client, pg_conn, source, index, total):
    """Run conditions A and B for one source and return its result record."""
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{index:02d}/{total}] {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    trunc_marker = "*" if truncated else " "
    print(f"[{index:02d}/{total}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A: baseline extraction. A failure is recorded, not fatal,
    # so the rest of the sample still runs.
    try:
        a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
        a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
        a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
        print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
              f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
        print(f" {fmt_metrics(a_metrics)}", flush=True)
    except Exception as e:
        print(f" A FAILED: {e}", flush=True)
        a = {"error": str(e)}
        a_metrics = None

    # Fields shared by every outcome; "condition_b" is added last on each path.
    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "condition_a": _condition_a_record(a, a_metrics),
    }

    # Condition B local metadata pass
    local_result = call_local_metadata(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']}", flush=True)
        record["condition_b"] = {
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        }
        return record

    local_raw = local_result["response"]
    metadata = parse_metadata(local_raw)
    # Override LLM-hallucinated char_length with Python-computed truth
    if metadata is not None and isinstance(metadata, dict):
        metadata["char_length"] = len(doc_text)
    print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
          flush=True)
    if metadata is None:
        print(" B: metadata parse failed — skipping API call", flush=True)
        record["condition_b"] = {
            "skipped": "metadata_parse_failed",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        }
        return record

    metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
    # str.replace, not str.format: the prompt contains literal JSON braces.
    b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text
    try:
        b = call_haiku(client, b_prompt)
        b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
        b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
        print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
              f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
        print(f" {fmt_metrics(b_metrics)}", flush=True)
    except Exception as e:
        print(f" B api FAILED: {e}", flush=True)
        b = {"error": str(e)}
        b_metrics = None

    # Per-doc deltas (only when both API calls returned usage figures)
    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        pred_pct_str = "n/a"
        if a_metrics and b_metrics:
            if a_metrics["n_edges"] > 0:
                edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
            if a_metrics["predicate_diversity"] > 0:
                pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
        print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
              flush=True)

    record["condition_b"] = {
        "local_latency_s": local_result.get("latency_s"),
        "local_metadata": metadata,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "metrics": b_metrics,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:32000],
        "error": b.get("error"),
    }
    return record


def main():
    """Run the A/B extraction experiment over the sample and write a summary.

    Requires ANTHROPIC_API_KEY and PG_DSN in the environment and V2_FILE on
    disk; exits with status 1 otherwise. For each sampled source the
    document text is fetched from Postgres, condition A (plain Haiku
    extraction) and condition B (local metadata, then Haiku extraction with
    that metadata) are run, and a per-document record is collected.
    Aggregate token, cost and graph-metric figures are computed over the
    documents where both conditions produced metrics, then everything is
    written to OUTPUT_FILE and a report is printed.

    The database connection is closed even if processing raises.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE) as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            results.append(
                _process_document(client, pg_conn, doc_meta["source"], i, len(sample))
            )
    finally:
        # Release the connection even when a document fails unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions yielded metrics enter the aggregates.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        by_bucket[bucket] = _bucket_summary(rows) if rows else None

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # Deltas are None when the baseline token sum is zero; show "n/a" then.
            print(f" cost: in {_fmt_signed_pct(stats['input_delta_pct'])} out {_fmt_signed_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
+593
View File
@@ -0,0 +1,593 @@
#!/usr/bin/env python3
"""
Base-Class Enrichment Test — OOP Framing Experiment
Tests whether non-entity metadata from a local model (domain class, structural
signals, presence flags, length, summary) can take load off the API without
constraining what it extracts.
The local model does NOT draft entities. The API still does full extraction.
The local model produces metadata that orients the API's reading.
Conditions:
A — Baseline: single Claude Haiku call, full extraction, no metadata
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
If B produces fewer edges or less predicate diversity, metadata is acting as
constraint and the OOP framing is falsified.
Sample: 20 docs from briefing_test_v2_results.json:
- 5 small (<1000 chars)
- 10 medium (1000-5000 chars)
- 5 large (5000-12000 chars, capped at 12K)
Outputs: ~/aaronai/experiments/base_class_test_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# Load environment variables from ~/aaronai/.env.
load_dotenv(Path.home() / "aaronai" / ".env")
# Input results file named in the module docstring as the sample source.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output path named in the module docstring.
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_test_results.json"
# Settings passed to client.messages.create() in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
# Local Ollama generate endpoint, model name and request timeout
# used by call_local_metadata().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates the joined document to this many characters.
MAX_DOC_CHARS = 12000
# NOTE(review): presumably USD per million input/output tokens for cost
# estimates; the code using them is outside this view — confirm.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) prompt: extraction with no metadata.
# NOTE(review): presumably the document text is appended after "DOCUMENT:"
# by the caller, which is outside this view — confirm.
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local model's metadata pass; call_local_metadata() appends
# the document text after "DOCUMENT:".
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
Return ONLY valid JSON with this exact schema:
{
"language": "en or other",
"char_length": integer,
"primary_format": "prose, presentation, list, form, code, or mixed",
"structural_signals": {
"has_headings": boolean,
"has_bullet_lists": boolean,
"has_numbered_lists": boolean,
"has_tables": boolean,
"has_code_blocks": boolean,
"has_dates": boolean
},
"content_signals": {
"has_named_people": boolean,
"has_institutional_language": boolean,
"has_technical_terminology": boolean,
"has_first_person": boolean,
"has_quotations": boolean
},
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
}
JSON only, no commentary.
DOCUMENT:
"""
# Condition B prompt with a "{metadata_json}" placeholder.
# NOTE(review): the schema below contains literal braces, so the placeholder
# is presumably filled with str.replace() rather than str.format(); the
# caller is outside this view — confirm.
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
DOCUMENT METADATA:
{metadata_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Return *text* without a wrapping Markdown code fence.

    Falsy input gives "". Otherwise surrounding whitespace is trimmed, an
    opening ``` or ```json marker and a closing ``` marker are removed if
    present at the very start or end, and the result is trimmed again.
    """
    if not text:
        return ""
    opening_fence = r"^```(?:json)?\s*"
    closing_fence = r"\s*```$"
    without_open = re.sub(opening_fence, "", text.strip())
    without_close = re.sub(closing_fence, "", without_open)
    return without_close.strip()
def fetch_document_text(pg_conn, source):
    """Fetch and join all stored chunks of one source document.

    Rows of ``embeddings`` whose ``source`` matches are read in ``id`` order
    and their ``document`` values joined with blank lines.

    Args:
        pg_conn: Open DB-API connection (psycopg2 in this file).
        source: Value matched against the ``source`` column.

    Returns:
        ``(text, full_length)``: text is the joined document cut to
        MAX_DOC_CHARS characters and full_length is its length before the
        cut. ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query raises, so it is not leaked.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and summarize the reply.

    Args:
        client: Object exposing ``messages.create``.
        prompt_text: Complete prompt, sent as the only user message.

    Returns:
        dict with input_tokens, output_tokens, latency_s (wall-clock seconds
        rounded to 2 places), response_text (text of the first content
        block, or "" when the reply has no content) and stop_reason.
        Exceptions from the client are not caught here.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface metadata about a document.

    Posts LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL as a
    non-streaming, JSON-formatted generation request.

    Returns:
        ``{"response": <raw model text>, "latency_s": <seconds>}`` on
        success, or ``{"error": <message>, "latency_s": <seconds>}`` when
        anything in the request raises. This function never raises; callers
        detect failure by the "error" key.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        # Built inside the try so a bad document_text is also reported as an error.
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        http_resp = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        http_resp.raise_for_status()
        body = http_resp.json()
        return {"response": body.get("response", ""), "latency_s": elapsed()}
    except Exception as exc:
        return {"error": str(exc), "latency_s": elapsed()}
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply is fence-stripped and decoded as JSON. It counts as parsed
    only when the top level is an object whose "entities" and "edges"
    values are both lists; those lists are returned unchanged. Every other
    outcome (empty text, invalid JSON, wrong shape) yields
    ``(None, None, False)``.
    """
    failure = (None, None, False)
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return failure
    try:
        payload = json.loads(cleaned)
    except json.JSONDecodeError:
        return failure
    if not isinstance(payload, dict):
        return failure
    entity_list, edge_list = payload.get("entities"), payload.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
def parse_metadata(raw):
    """Decode the local model's metadata reply.

    Returns the decoded JSON value (whatever its type) after fence
    stripping, or None when the cleaned text is empty or not valid JSON.
    """
    cleaned = strip_json_fences(raw)
    try:
        parsed = json.loads(cleaned) if cleaned else None
    except json.JSONDecodeError:
        parsed = None
    return parsed
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    Args:
        entities: List of entity records (dicts with ``name``/``type``), or None.
        edges: List of edge records (dicts with ``subject``/``predicate``/
            ``object``), or None.

    Returns:
        None when either input is None; otherwise a dict with raw counts,
        the number of distinct predicates and entity types (compared after
        trimming and lower-casing), the average degree, and the size and
        percentage of the largest connected component. Non-dict items are
        counted in the raw totals but ignored everywhere else.
    """
    if entities is None or edges is None:
        return None

    def normalise(value):
        return str(value).strip().lower()

    def distinct(records, key):
        # Distinct normalised values of `key` across the dict records,
        # skipping missing/empty values.
        found = set()
        for record in records:
            if not isinstance(record, dict):
                continue
            value = record.get(key)
            if value:
                found.add(normalise(value))
        return found

    entity_total = len(entities)
    edge_total = len(edges)
    predicate_count = len(distinct(edges, "predicate"))
    type_count = len(distinct(entities, "type"))

    # Every edge contributes to the degree of two nodes.
    mean_degree = (2 * edge_total / entity_total) if entity_total > 0 else 0.0

    # Undirected adjacency over declared entity names only; edges that
    # mention an undeclared endpoint are left out of the component search.
    neighbours = {name: set() for name in distinct(entities, "name")}
    for edge in edges:
        if not isinstance(edge, dict):
            continue
        head = normalise(edge.get("subject", ""))
        tail = normalise(edge.get("object", ""))
        if head in neighbours and tail in neighbours:
            neighbours[head].add(tail)
            neighbours[tail].add(head)

    # Stack-based flood fill: peel off one component at a time.
    unvisited = set(neighbours)
    biggest = 0
    while unvisited:
        frontier = [unvisited.pop()]
        size = 0
        while frontier:
            node = frontier.pop()
            size += 1
            for other in neighbours[node]:
                if other in unvisited:
                    unvisited.remove(other)
                    frontier.append(other)
        biggest = max(biggest, size)

    if entity_total:
        biggest_pct = round(100 * biggest / entity_total, 1)
    else:
        biggest_pct = 0.0
    return {
        "n_entities": entity_total,
        "n_edges": edge_total,
        "predicate_diversity": predicate_count,
        "type_diversity": type_count,
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
def stratify(docs):
    """Pick a fixed-quota sample by content length, preserving input order.

    Documents are bucketed on ``content_length``: small (<1000), medium
    (1000-4999) and large (>=5000). The result is the first 5 small, then
    the first 10 medium, then the first 5 large documents.
    """
    small_docs, medium_docs, large_docs = [], [], []
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            small_docs.append(doc)
        elif length < 5000:
            medium_docs.append(doc)
        else:
            large_docs.append(doc)
    picked = small_docs[:5]
    picked += medium_docs[:10]
    picked += large_docs[:5]
    return picked
def fmt_metrics(m):
    """Render a graph_metrics dict as a compact one-line string ("n/a" for None)."""
    if m is None:
        return "n/a"
    labelled = (
        ("e", "n_entities"),
        ("edge", "n_edges"),
        ("pred", "predicate_diversity"),
        ("type", "type_diversity"),
        ("deg", "avg_degree"),
    )
    head = " ".join(f"{label}={m[key]}" for label, key in labelled)
    return f"{head} comp={m['largest_component']}/{m['n_entities']}"
def main():
    """Run the A/B extraction experiment over the sample and write a summary.

    Reads ANTHROPIC_API_KEY and PG_DSN from the environment and the document
    list from V2_FILE; exits with status 1 when any of them is missing. For
    each sampled document it runs Condition A (Haiku alone), the local
    metadata pass, and Condition B (Haiku with metadata), recording one
    result entry per document. Totals and per-bucket averages are computed
    only over documents where both conditions produced parseable graphs,
    then written to OUTPUT_FILE and printed.
    """

    def condition_a_record(a, a_metrics):
        # Stored form of a Condition A outcome; `a` is either a call_haiku
        # result or {"error": ...}, so every field is read with .get().
        return {
            "input_tokens": a.get("input_tokens"),
            "output_tokens": a.get("output_tokens"),
            "latency_s": a.get("latency_s"),
            "metrics": a_metrics,
            "stop_reason": a.get("stop_reason"),
            "response_text": a.get("response_text", "")[:4000],
            "error": a.get("error"),
        }

    def fmt_delta(value):
        # Bucket deltas are None when the baseline token total is zero.
        return "n/a" if value is None else f"{value:+.1f}%"

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE) as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (5s/10m/5l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()
    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue
            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket on what was actually sent, i.e. after truncation.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)
            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
                a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
                print(f" {fmt_metrics(a_metrics)}", flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_metrics = None
            # Fields shared by every outcome recorded for this document.
            doc_record = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": condition_a_record(a, a_metrics),
            }
            # Condition B local metadata pass
            local_result = call_local_metadata(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']}", flush=True)
                results.append({
                    **doc_record,
                    "condition_b": {
                        "skipped": "local_model_failed",
                        "local_error": local_result["error"],
                        "local_latency_s": local_result.get("latency_s"),
                    },
                })
                continue
            local_raw = local_result["response"]
            metadata = parse_metadata(local_raw)
            print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
                  flush=True)
            if metadata is None:
                print(" B: metadata parse failed — skipping API call", flush=True)
                results.append({
                    **doc_record,
                    "condition_b": {
                        "skipped": "metadata_parse_failed",
                        "local_latency_s": local_result.get("latency_s"),
                        "local_raw": local_raw[:1000],
                    },
                })
                continue
            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
            # str.replace rather than str.format: the prompt's JSON schema
            # contains literal braces.
            b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text
            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
                b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
                print(f" {fmt_metrics(b_metrics)}", flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_metrics = None
            # Per-doc deltas
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                pred_pct_str = "n/a"
                if a_metrics and b_metrics:
                    if a_metrics["n_edges"] > 0:
                        edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                    if a_metrics["predicate_diversity"] > 0:
                        pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
                print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                      flush=True)
            results.append({
                **doc_record,
                "condition_b": {
                    "local_latency_s": local_result.get("latency_s"),
                    "local_metadata": metadata,
                    "local_raw": local_raw[:1000],
                    "api_input_tokens": b.get("input_tokens"),
                    "api_output_tokens": b.get("output_tokens"),
                    "api_latency_s": b.get("latency_s"),
                    "metrics": b_metrics,
                    "stop_reason": b.get("stop_reason"),
                    "response_text": b.get("response_text", "")[:4000],
                    "error": b.get("error"),
                },
            })
    finally:
        # Close the connection even when a document aborts the run.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)
    # Only documents with parseable graphs on both sides are comparable.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }
    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)
    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            print(f" cost: in {fmt_delta(stats['input_delta_pct'])} out {fmt_delta(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
            print()
    print(f"Results: {OUTPUT_FILE}")
# Script entry point: run the experiment only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
+611
View File
@@ -0,0 +1,611 @@
#!/usr/bin/env python3
"""
Base-Class Enrichment Test — OOP Framing Experiment
Tests whether non-entity metadata from a local model (domain class, structural
signals, presence flags, length, summary) can take load off the API without
constraining what it extracts.
The local model does NOT draft entities. The API still does full extraction.
The local model produces metadata that orients the API's reading.
Conditions:
A — Baseline: single Claude Haiku call, full extraction, no metadata
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
If B produces fewer edges or less predicate diversity, metadata is acting as
constraint and the OOP framing is falsified.
Sample: 50 docs:
- 15 small (<1000 chars) from briefing_test_v2_results.json
- 25 medium (1000-5000 chars) from briefing_test_v2_results.json
- 10 large (5000-12000 chars, capped at 12K) listed in large_bucket_sources.json
Outputs: ~/aaronai/experiments/base_class_validation_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# Load the project's .env file so main() can read ANTHROPIC_API_KEY and PG_DSN
# from the environment.
load_dotenv(Path.home() / "aaronai" / ".env")
# Input: JSON file whose "documents" list main() filters and samples from.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output: summary JSON written at the end of main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_validation_results.json"
# Settings passed to client.messages.create() by call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0
# Local Ollama endpoint, model name and request timeout (seconds) used by
# call_local_metadata().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180
# Character cap applied to each document by fetch_document_text().
MAX_DOC_CHARS = 12000
# Pricing factors: main() multiplies token totals by these and divides by
# 1,000,000 to obtain the *_cost_usd figures, i.e. USD per million tokens.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) prompt: full knowledge-graph extraction with no
# metadata. main() appends the document text directly after "DOCUMENT:".
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local model's metadata pass; call_local_metadata() appends
# the document text after "DOCUMENT:". The "char_length" the model reports is
# overwritten in main() with the length computed in Python.
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
Return ONLY valid JSON with this exact schema:
{
"language": "en or other",
"char_length": integer,
"primary_format": "prose, presentation, list, form, code, or mixed",
"structural_signals": {
"has_headings": boolean,
"has_bullet_lists": boolean,
"has_numbered_lists": boolean,
"has_tables": boolean,
"has_code_blocks": boolean,
"has_dates": boolean
},
"content_signals": {
"has_named_people": boolean,
"has_institutional_language": boolean,
"has_technical_terminology": boolean,
"has_first_person": boolean,
"has_quotations": boolean
},
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
}
JSON only, no commentary.
DOCUMENT:
"""
# Condition B prompt: same extraction task, preceded by the local model's
# metadata. main() fills the {metadata_json} placeholder with str.replace()
# (the schema below contains literal braces) and appends the document text.
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
DOCUMENT METADATA:
{metadata_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
_LEADING_FENCE = re.compile(r"^```(?:json)?\s*")
_TRAILING_FENCE = re.compile(r"\s*```$")


def strip_json_fences(text):
    """Remove a surrounding Markdown code fence from a model reply.

    Returns "" for empty/None input; otherwise the trimmed text with a
    leading ``` or ```json marker and a trailing ``` marker removed when
    present.
    """
    if not text:
        return ""
    body = _LEADING_FENCE.sub("", text.strip())
    body = _TRAILING_FENCE.sub("", body)
    return body.strip()
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its stored rows and cap it for prompting.

    Selects every ``document`` value in the ``embeddings`` table whose
    ``source`` column matches, ordered by ``id``, and joins the pieces with
    blank lines.

    Args:
        pg_conn: Open database connection (main() passes a psycopg2 connection).
        source: Value matched against ``embeddings.source``.

    Returns:
        ``(text, original_len)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``original_len`` is the length before
        cutting, or ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query raises, so a failed lookup
        # does not leak it on the connection shared by the whole run.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and summarize the reply.

    Args:
        client: Object exposing ``messages.create`` (main() passes an
            ``anthropic.Anthropic`` client).
        prompt_text: Full prompt, sent as a single user turn.

    Returns:
        Dict with ``input_tokens``, ``output_tokens``, ``latency_s`` (wall
        time rounded to 2 decimals), ``response_text`` (text of the first
        content block, or "" when the reply has no content) and
        ``stop_reason``. Exceptions from the client propagate to the caller.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    blocks = reply.content
    summary["response_text"] = blocks[0].text if blocks else ""
    summary["stop_reason"] = reply.stop_reason
    return summary
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface-level metadata about a document.

    Posts LOCAL_METADATA_PROMPT followed by ``document_text`` to OLLAMA_URL
    as a non-streaming, JSON-format request.

    Returns:
        ``{"response": <raw model text>, "latency_s": <float>}`` on success,
        or ``{"error": <message>, "latency_s": <float>}`` when anything in
        the request/response handling raises. Failures are reported in the
        dict rather than raised so the caller can record them per document.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        model_text = reply.json().get("response", "")
        return {"response": model_text, "latency_s": elapsed()}
    except Exception as exc:
        return {"error": str(exc), "latency_s": elapsed()}
def parse_graph_full(raw):
    """Decode a model reply into ``(entities, edges, parsed_ok)``.

    The reply is accepted only when, after fence stripping, it is a JSON
    object whose ``entities`` and ``edges`` values are both lists. Any other
    outcome (empty text, invalid JSON, wrong shape) yields
    ``(None, None, False)``.
    """
    failure = (None, None, False)
    text = strip_json_fences(raw)
    if not text:
        return failure
    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return failure
    if not isinstance(payload, dict):
        return failure
    entities = payload.get("entities")
    relations = payload.get("edges")
    if not (isinstance(entities, list) and isinstance(relations, list)):
        return failure
    return entities, relations, True
def parse_metadata(raw):
    """Decode the local model's metadata reply into a dict.

    Returns:
        The decoded JSON object, or None when the reply is empty, is not
        valid JSON, or decodes to anything other than an object.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    # json.loads also accepts bare lists, strings and numbers; the metadata
    # schema is an object, so anything else counts as a failed parse instead
    # of being injected into the Condition B prompt.
    return parsed if isinstance(parsed, dict) else None
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    Args:
        entities: List of entity records (dicts with ``name``/``type``), or None.
        edges: List of edge records (dicts with ``subject``/``predicate``/
            ``object``), or None.

    Returns:
        None when either input is None; otherwise a dict with raw counts,
        the number of distinct predicates and entity types (compared after
        trimming and lower-casing), the average degree, and the size and
        percentage of the largest connected component. Non-dict items are
        counted in the raw totals but ignored everywhere else.
    """
    if entities is None or edges is None:
        return None

    def normalise(value):
        return str(value).strip().lower()

    def distinct(records, key):
        # Distinct normalised values of `key` across the dict records,
        # skipping missing/empty values.
        found = set()
        for record in records:
            if not isinstance(record, dict):
                continue
            value = record.get(key)
            if value:
                found.add(normalise(value))
        return found

    entity_total = len(entities)
    edge_total = len(edges)
    predicate_count = len(distinct(edges, "predicate"))
    type_count = len(distinct(entities, "type"))

    # Every edge contributes to the degree of two nodes.
    mean_degree = (2 * edge_total / entity_total) if entity_total > 0 else 0.0

    # Undirected adjacency over declared entity names only; edges that
    # mention an undeclared endpoint are left out of the component search.
    neighbours = {name: set() for name in distinct(entities, "name")}
    for edge in edges:
        if not isinstance(edge, dict):
            continue
        head = normalise(edge.get("subject", ""))
        tail = normalise(edge.get("object", ""))
        if head in neighbours and tail in neighbours:
            neighbours[head].add(tail)
            neighbours[tail].add(head)

    # Stack-based flood fill: peel off one component at a time.
    unvisited = set(neighbours)
    biggest = 0
    while unvisited:
        frontier = [unvisited.pop()]
        size = 0
        while frontier:
            node = frontier.pop()
            size += 1
            for other in neighbours[node]:
                if other in unvisited:
                    unvisited.remove(other)
                    frontier.append(other)
        biggest = max(biggest, size)

    if entity_total:
        biggest_pct = round(100 * biggest / entity_total, 1)
    else:
        biggest_pct = 0.0
    return {
        "n_entities": entity_total,
        "n_edges": edge_total,
        "predicate_diversity": predicate_count,
        "type_diversity": type_count,
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
def stratify(docs):
    """Assemble the sample: small and medium docs from v2, large from a side file.

    The first 15 documents with ``content_length`` below 1000 and the first
    25 with ``content_length`` in [1000, 5000) are taken from ``docs`` in
    input order. Large documents are not taken from ``docs``; their source
    names are read from ``~/aaronai/large_bucket_sources.json`` when that
    file exists.

    Args:
        docs: Document metadata dicts, each with a ``content_length`` key.

    Returns:
        ``small + medium + large``. Large entries are synthesized with a
        placeholder ``content_length`` of 0; main() derives the size bucket
        from the text it actually fetches, not from this field.
    """
    small = [d for d in docs if d["content_length"] < 1000][:15]
    medium = [d for d in docs if 1000 <= d["content_length"] < 5000][:25]
    # Load large bucket from external sources file
    large_sources_file = Path.home() / "aaronai" / "large_bucket_sources.json"
    if large_sources_file.exists():
        large_source_names = json.loads(large_sources_file.read_text(encoding="utf-8"))
        # Synthesize doc_meta entries for the large sources
        large = [{"source": s, "content_length": 0, "status": "SUCCESS"}
                 for s in large_source_names]
        # Report the counts actually drawn rather than the nominal quotas.
        print(f"Stratify: {len(small)} small + {len(medium)} medium from v2, "
              f"{len(large)} large from large_bucket_sources.json")
    else:
        large = []
        print("WARN: large_bucket_sources.json not found, no large docs in sample")
    return small + medium + large
def fmt_metrics(m):
    """Render a graph_metrics dict as a compact one-line string ("n/a" for None)."""
    if m is None:
        return "n/a"
    labelled = (
        ("e", "n_entities"),
        ("edge", "n_edges"),
        ("pred", "predicate_diversity"),
        ("type", "type_diversity"),
        ("deg", "avg_degree"),
    )
    head = " ".join(f"{label}={m[key]}" for label, key in labelled)
    return f"{head} comp={m['largest_component']}/{m['n_entities']}"
def main():
    """Run the A/B extraction experiment over the sample and write a summary.

    Reads ANTHROPIC_API_KEY and PG_DSN from the environment and the document
    list from V2_FILE; exits with status 1 when any of them is missing. For
    each sampled document it runs Condition A (Haiku alone), the local
    metadata pass, and Condition B (Haiku with metadata), recording one
    result entry per document. Totals and per-bucket averages are computed
    only over documents where both conditions produced parseable graphs,
    then written to OUTPUT_FILE and printed.
    """

    def condition_a_record(a, a_metrics):
        # Stored form of a Condition A outcome; `a` is either a call_haiku
        # result or {"error": ...}, so every field is read with .get().
        return {
            "input_tokens": a.get("input_tokens"),
            "output_tokens": a.get("output_tokens"),
            "latency_s": a.get("latency_s"),
            "metrics": a_metrics,
            "stop_reason": a.get("stop_reason"),
            "response_text": a.get("response_text", "")[:32000],
            "error": a.get("error"),
        }

    def fmt_delta(value):
        # Bucket deltas are None when the baseline token total is zero.
        return "n/a" if value is None else f"{value:+.1f}%"

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE) as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()
    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue
            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket on what was actually sent, i.e. after truncation.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)
            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
                a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
                print(f" {fmt_metrics(a_metrics)}", flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_metrics = None
            # Fields shared by every outcome recorded for this document.
            doc_record = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": condition_a_record(a, a_metrics),
            }
            # Condition B local metadata pass
            local_result = call_local_metadata(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']}", flush=True)
                results.append({
                    **doc_record,
                    "condition_b": {
                        "skipped": "local_model_failed",
                        "local_error": local_result["error"],
                        "local_latency_s": local_result.get("latency_s"),
                    },
                })
                continue
            local_raw = local_result["response"]
            metadata = parse_metadata(local_raw)
            # Override LLM-hallucinated char_length with Python-computed truth
            if isinstance(metadata, dict):
                metadata["char_length"] = len(doc_text)
            print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
                  flush=True)
            if metadata is None:
                print(" B: metadata parse failed — skipping API call", flush=True)
                results.append({
                    **doc_record,
                    "condition_b": {
                        "skipped": "metadata_parse_failed",
                        "local_latency_s": local_result.get("latency_s"),
                        "local_raw": local_raw[:1000],
                    },
                })
                continue
            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
            # str.replace rather than str.format: the prompt's JSON schema
            # contains literal braces.
            b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text
            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
                b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
                print(f" {fmt_metrics(b_metrics)}", flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_metrics = None
            # Per-doc deltas
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                pred_pct_str = "n/a"
                if a_metrics and b_metrics:
                    if a_metrics["n_edges"] > 0:
                        edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                    if a_metrics["predicate_diversity"] > 0:
                        pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
                print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                      flush=True)
            results.append({
                **doc_record,
                "condition_b": {
                    "local_latency_s": local_result.get("latency_s"),
                    "local_metadata": metadata,
                    "local_raw": local_raw[:1000],
                    "api_input_tokens": b.get("input_tokens"),
                    "api_output_tokens": b.get("output_tokens"),
                    "api_latency_s": b.get("latency_s"),
                    "metrics": b_metrics,
                    "stop_reason": b.get("stop_reason"),
                    "response_text": b.get("response_text", "")[:32000],
                    "error": b.get("error"),
                },
            })
    finally:
        # Close the connection even when a document aborts the run.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)
    # Only documents with parseable graphs on both sides are comparable.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }
    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)
    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            print(f" cost: in {fmt_delta(stats['input_delta_pct'])} out {fmt_delta(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
            print()
    print(f"Results: {OUTPUT_FILE}")
# Script entry point: run the experiment only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    main()
+376
View File
@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
BirdAI Briefing Generator v2 — Experiment 002b
===============================================
Changes from v1 (based on Experiment 004 human evaluation):
- document_type now pre-classified by rule, not by model
- Capture template header stripped before model sees content
- noise_signals constrained to controlled vocabulary
- Model prompt simplified — focuses only on reliable signal fields
- Expanded document type vocabulary for BirdAI-specific types
Results written to ~/aaronai/briefing_test_v2_results.json
"""
import json
import os
import re
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
# Load the project's .env file before PG_DSN is read from the environment below.
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# PostgreSQL connection string; None when the variable is not set.
PG_DSN = os.getenv("PG_DSN")
# JSON file that receives the per-document results and the final summary.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_v2_results.json")
# Model name sent in the generate request posted to OLLAMA_URL.
MODEL = "mistral"
# Upper bound on the number of documents fetched (used as the SQL LIMIT).
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"
# Document types accepted from the model; any other value is stored as "unknown".
VALID_DOC_TYPES = {
    "voice_capture", "image_capture",
    "dream_nrem", "dream_rem", "dream_lucid", "dream_synthesis",
    "presentation", "code", "spreadsheet",
    "academic_pdf", "technical_doc", "chat_log",
    "book_excerpt", "form", "syllabus", "email",
    "notes", "purchase_order", "annual_report",
    "invoice", "memo", "report", "unknown"
}
# Accepted values for the "density" field; anything else falls back to "medium".
VALID_DENSITIES = {"high", "medium", "low"}
# Accepted values for "extraction_priority"; anything else falls back to "partial".
VALID_PRIORITIES = {"full", "partial", "skip"}
# Controlled vocabulary for "noise_signals"; terms outside it are dropped.
VALID_NOISE_SIGNALS = {
    "repeated_headers", "page_numbers", "formatting_artifacts",
    "boilerplate", "watermarks", "footers", "line_numbers",
    "encoding_artifacts", "ocr_errors"
}
# Controlled vocabulary for "structure_signals"; terms outside it are dropped.
VALID_STRUCTURE_SIGNALS = {
    "headings", "bullet_lists", "numbered_lists", "tables",
    "code_blocks", "citations", "footnotes", "images",
    "forms", "columns", "sections"
}
def pre_classify_document(source, content):
    """Classify a document by rule and strip a capture/dream template header.

    Args:
        source: Path or name of the document; only its lowercased basename
            is inspected.
        content: Raw document text.

    Returns:
        A ``(doc_type, cleaned_content)`` tuple. ``doc_type`` is ``None``
        when no rule matches. ``cleaned_content`` is the text after the first
        ``---`` when the text before it looks like a template header and the
        remainder is non-empty; otherwise it is ``content`` unchanged.
    """
    name = os.path.basename(source).lower()

    # Template stripping: only the text before the FIRST "---" counts as header.
    text = content
    head, separator, tail = content.partition("---")
    if separator:
        lowered_head = head.lower()
        remainder = tail.strip()
        header_markers = ("**type:**", "**modality:**", "# capture", "# dream")
        if remainder and any(marker in lowered_head for marker in header_markers):
            text = remainder

    # Filename rules are checked first, in priority order.
    if "nrem" in name:
        return "dream_nrem", text
    if "lucid" in name:
        return "dream_lucid", text
    if "-rem-" in name or name.endswith("-rem.md"):
        return "dream_rem", text
    if "synthesis" in name and name.endswith(".md"):
        return "dream_synthesis", text
    if "-voice" in name or "voice-" in name:
        return "voice_capture", text
    if "-image" in name or "image-" in name:
        return "image_capture", text
    if name.endswith((".pptx", ".ppt")):
        return "presentation", text
    if name.endswith((".xlsx", ".xls", ".csv")):
        return "spreadsheet", text
    if name.endswith((".py", ".js", ".ts", ".cpp", ".c", ".h", ".java", ".rs")):
        return "code", text
    if name.endswith("cmakelists.txt") or name == "makefile":
        return "code", text

    # Content rules: fall back to the leading heading of the raw text.
    if content.startswith("# Dream"):
        preview = content[:50].lower()
        if "nrem" in preview:
            return "dream_nrem", text
        if "lucid" in preview:
            return "dream_lucid", text
        if "rem" in preview:
            return "dream_rem", text
        return "dream_synthesis", text
    if content.startswith("# Capture"):
        if "voice" in content[:100].lower():
            return "voice_capture", text
        return "image_capture", text

    return None, text
def build_briefing_prompt(content, pre_classified_type=None):
    """Build the JSON-briefing prompt sent to the local model.

    Args:
        content: Document text; only the first 1500 characters are embedded.
        pre_classified_type: Rule-derived document type. When truthy it is
            written into the template as a fixed value; otherwise the model
            is asked to choose from the listed types.

    Returns:
        The complete prompt string.
    """
    if not pre_classified_type:
        type_instruction = '\n "document_type": "one of: academic_pdf, technical_doc, chat_log, book_excerpt, form, syllabus, email, notes, purchase_order, annual_report, invoice, memo, report, unknown",'
    else:
        type_instruction = f'\n "document_type": "{pre_classified_type}", // pre-classified, do not change'
    excerpt = content[:1500]
    prompt = f"""Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.
Return exactly this structure:
{{{type_instruction}
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}}
Rules:
- density: high=information dense technical or academic, medium=mixed, low=narrative/literary/sparse/short
- has_proper_nouns: true if you see capitalized words that are NOT sentence starts or template headers
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: use ONLY these terms: headings, bullet_lists, numbered_lists, tables, code_blocks, citations, footnotes, images, forms, columns, sections
- noise_signals: use ONLY these terms: repeated_headers, page_numbers, formatting_artifacts, boilerplate, watermarks, footers, line_numbers, encoding_artifacts, ocr_errors
- extraction_priority: full if density=high and likely_has_named_entities=true; skip if density=low AND likely_has_named_entities=false AND content is under 200 words; partial otherwise
Document:
{excerpt}"""
    return prompt
def get_sample_documents():
    """Fetch up to SAMPLE_SIZE rows from ``embeddings``, one per distinct source.

    Only rows whose ``document`` length is strictly between 100 and 3000
    characters are considered. ``ORDER BY source, random()`` makes
    ``DISTINCT ON (source)`` keep a random qualifying row for each source.

    Returns:
        A list of dict-like rows (RealDictCursor) with the keys ``id``,
        ``document``, ``source`` and ``created_at``.

    Raises:
        RuntimeError: If PG_DSN is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            # NOTE(review): rows come back ordered by source, so LIMIT keeps the
            # first SAMPLE_SIZE sources in sort order rather than a random set
            # of sources. Left unchanged so earlier runs stay comparable.
            cur.execute("""
SELECT DISTINCT ON (source) id, document, source, created_at
FROM embeddings
WHERE length(document) > 100
AND length(document) < 3000
ORDER BY source, random()
LIMIT %s
""", (SAMPLE_SIZE,))
            return cur.fetchall()
        finally:
            # Close the cursor even when execute/fetchall raises.
            cur.close()
    finally:
        # Likewise release the connection on every exit path.
        conn.close()
def run_briefing(prompt):
    """Send ``prompt`` to the Ollama generate endpoint and parse the JSON reply.

    The model's text is searched for the span from its first ``{`` to its
    last ``}`` and that span is decoded as JSON.

    Returns:
        ``(parsed_dict, raw_text)`` on success, or ``(None, message)`` when
        the request fails, times out (180 s), or the reply holds no JSON
        object. Failures are reported through the message, never raised.
    """
    request_body = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            envelope = json.loads(resp.read().decode())
        raw = envelope.get("response", "").strip()
        first_brace = raw.find("{")
        last_brace = raw.rfind("}")
        if first_brace == -1 or last_brace == -1:
            return None, f"NO_JSON: {raw[:200]}"
        parsed = json.loads(raw[first_brace:last_brace + 1])
        if isinstance(parsed, dict):
            return parsed, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        # raw is still "" here if the HTTP envelope itself was not JSON.
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
def sanitize_briefing(briefing, pre_classified_type=None):
    """Coerce a model-produced briefing into the expected, validated shape.

    Args:
        briefing: Dict parsed from the model's reply.
        pre_classified_type: When truthy, overrides whatever document type
            the model returned.

    Returns:
        A new dict containing only the known fields: enumerated values
        outside their vocabulary fall back to a default, flags are coerced
        to bool, and signal lists are filtered to the controlled vocabulary.
    """
    def pick(key, fallback, allowed):
        # Normalise to a lowercase, stripped string; reject unknown values.
        candidate = str(briefing.get(key, fallback)).lower().strip()
        return candidate if candidate in allowed else fallback

    clean = {}
    if pre_classified_type:
        clean["document_type"] = pre_classified_type
    else:
        clean["document_type"] = pick("document_type", "unknown", VALID_DOC_TYPES)
    clean["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]
    clean["density"] = pick("density", "medium", VALID_DENSITIES)

    flag_names = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for flag in flag_names:
        value = briefing.get(flag, False)
        if isinstance(value, str):
            # Models sometimes return booleans as text.
            clean[flag] = value.lower() in ("true", "yes", "1")
        else:
            clean[flag] = bool(value)

    vocabularies = (
        ("structure_signals", VALID_STRUCTURE_SIGNALS),
        ("noise_signals", VALID_NOISE_SIGNALS),
    )
    for key, allowed in vocabularies:
        value = briefing.get(key, [])
        if isinstance(value, str):
            items = [value]  # a lone term is treated as a one-element list
        elif isinstance(value, list):
            items = value
        else:
            items = []
        normalised = (str(item).lower().strip() for item in items)
        clean[key] = [term for term in normalised if term in allowed]

    clean["extraction_priority"] = pick("extraction_priority", "partial", VALID_PRIORITIES)
    return clean
def estimate_token_reduction(original_text, briefing):
    """Estimate the token savings a briefing implies for one document.

    Tokens are approximated as ``len(text) / 4`` with a floor of 1. A
    briefing with priority ``"skip"`` counts as a 100% reduction. Otherwise
    the estimate is a fixed 200 orientation tokens plus 5% of the document
    per noise signal (capped at 40%), with the total capped at 99%.

    Returns:
        A dict of rounded estimates; the ``skip`` case carries a ``note``
        key instead of ``noise_tokens_saved``.
    """
    approx_tokens = max(len(original_text) / 4, 1)

    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": round(approx_tokens),
            "orientation_tokens_saved": round(approx_tokens + 200),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call",
        }

    orientation_saved = 200
    signals = briefing.get("noise_signals", [])
    noise_fraction = min(len(signals) * 0.05, 0.40)
    noise_saved = approx_tokens * noise_fraction
    combined_saved = orientation_saved + noise_saved
    overall_pct = min((combined_saved / (approx_tokens + 200)) * 100, 99.0)
    return {
        "original_tokens_approx": round(approx_tokens),
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_saved),
        "noise_reduction_pct": round(noise_fraction * 100, 1),
        "total_reduction_pct": round(overall_pct, 1),
    }
def format_eta(elapsed_times, completed, total):
    """Return an ``"ETA: H:MM:SS"`` label from the mean time per finished item.

    Args:
        elapsed_times: Durations in seconds of the items processed so far.
        completed: Number of items finished; 0 yields a placeholder label.
        total: Total number of items.
    """
    if completed != 0:
        mean_seconds = sum(elapsed_times) / completed
        remaining = timedelta(seconds=int((total - completed) * mean_seconds))
        return "ETA: " + str(remaining)
    return "ETA: --:--"
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of ``text``."""
    digest = hashlib.md5(text.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:8]
def main():
    """Run the briefing experiment over a document sample and save the results.

    For each sampled document: pre-classify by rule, build the prompt, query
    the local model, sanitize the reply and estimate token savings. Results
    are written to RESULTS_FILE after every document and again, with the
    summary, at the end.
    """
    test_start = time.time()
    print("\nBirdAI Briefing Generator v2 — Experiment 002b")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print("Changes: rule-based doc_type, template stripping, controlled vocab")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")
    results = {
        "meta": {"model": MODEL, "version": "v2", "sample_size": len(docs),
                 "started": datetime.now().isoformat(), "completed": None,
                 "total_elapsed_seconds": None, "avg_seconds_per_doc": None},
        "documents": [], "summary": {}
    }
    success_count = 0
    failed_count = 0
    pre_classified_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        pre_type, cleaned_content = pre_classify_document(source, content)
        was_pre_classified = pre_type is not None
        if was_pre_classified:
            pre_classified_count += 1
        eta_str = format_eta(elapsed_times, i, len(docs))
        # "R" = type decided by rule, "M" = left to the model.
        pre_flag = "R" if was_pre_classified else "M"
        print(f"[{i+1:02d}/{len(docs)}][{pre_flag}] {source[:36]:<36} {eta_str:<14}", end=" ", flush=True)
        prompt = build_briefing_prompt(cleaned_content, pre_type)
        t_start = time.time()
        briefing, raw = run_briefing(prompt)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)
        if briefing is None:
            # On failure run_briefing returns the error message in place of the raw text.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "pre_classified_type": pre_type, "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing, pre_type)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(cleaned_content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"{priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "cleaned_content_length": len(cleaned_content),
                "status": "SUCCESS", "pre_classified_type": pre_type,
                "was_pre_classified": was_pre_classified, "elapsed_seconds": elapsed,
                "briefing": briefing, "token_reduction_estimate": reduction
            })
        # Checkpoint after every document so an interrupted run keeps its progress.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc
    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the empty-sample case: the query can legitimately return no rows,
    # and the summary should still be written instead of crashing here.
    success_ratio = success_count / total if total else 0.0
    summary = {
        "total": total, "success": success_count, "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "pre_classified_by_rule": pre_classified_count,
        "classified_by_model": total - pre_classified_count,
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed, "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8
    }
    results["summary"] = summary
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 75)
    print("RESULTS — Briefing Generator v2")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Pre-classified (rule): {pre_classified_count}")
    print(f" Classified (model): {total - pre_classified_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)


if __name__ == "__main__":
    main()
+508
View File
@@ -0,0 +1,508 @@
#!/usr/bin/env python3
"""
Cascade Optimization Test — skip-small + compressed-draft
Tests whether two optimizations on the entity-drafter cascade meaningfully
improve the savings ceiling beyond the prior unoptimized cascade (12.66%).
Optimizations:
A — Skip-small-docs routing: docs <1000 chars bypass the local pass entirely
B — Compressed draft format: bare JSON array instead of markdown bullets
Conditions:
A — Baseline: single Claude Haiku call, full extraction (unchanged from prior)
B — Optimized cascade: skip-small + compressed draft, otherwise same cascade
Sample: 30 docs from briefing_test_v2_results.json:
- 10 small (<1000 chars) — should show 0% delta if skip-small works
- 12 medium (1000-5000 chars) — primary test bucket
- 8 large (5000-12000 chars, capped at 12K)
Mistral context: 12K (raised from 8K in prior run).
Outputs: ~/aaronai/experiments/cascade_optimization_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# Load the project's .env file; main() reads ANTHROPIC_API_KEY and PG_DSN from the environment.
load_dotenv(Path.home() / "aaronai" / ".env")
# Results of the earlier briefing run; supplies the candidate document list.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_optimization_results.json"
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
# Seconds allowed for one local-model request.
LOCAL_TIMEOUT = 180  # raised — 12K context can take longer
# Documents are truncated to this many characters before either condition sees them.
MAX_DOC_CHARS = 12000  # raised from 8K
# Documents shorter than this many characters bypass the local pass.
SKIP_SMALL_THRESHOLD = 1000
# Haiku pricing in USD per million input / output tokens (used for cost totals).
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) prompt; the document text is appended directly after it.
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local model's entity-candidate pass; the document text is appended.
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
Return ONLY valid JSON:
{
"candidates": [string]
}
Just names. No types, no relationships. JSON only.
DOCUMENT:
"""
# Compressed draft format — bare JSON array, minimal preamble.
# {local_draft_json} is filled in with str.replace rather than str.format,
# because the template also contains literal JSON braces.
CONDITION_B_API_PROMPT_COMPRESSED = """Extract a knowledge graph from the document below.
Local model entity candidates (hint, not authoritative — verify against the document, ignore false ones, add missed ones):
{local_draft_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Remove a leading and a trailing markdown code fence from ``text``.

    Falsy input (``None`` or ``""``) yields ``""``. Only a fence at the very
    start (optionally tagged ``json``) and one at the very end are removed.
    """
    stripped = text.strip() if text else ""
    fence_patterns = (r"^```(?:json)?\s*", r"\s*```$")
    for pattern in fence_patterns:
        stripped = re.sub(pattern, "", stripped)
    return stripped.strip()
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its ``embeddings`` rows, ordered by id.

    Args:
        pg_conn: Open DB-API connection.
        source: Value matched against the ``source`` column.

    Returns:
        ``(text, full_length)``: the rows joined by blank lines and truncated
        to MAX_DOC_CHARS, plus the length before truncation. Returns
        ``(None, 0)`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and summarise the reply.

    Args:
        client: Object exposing ``messages.create`` (an Anthropic client).
        prompt_text: Full prompt, sent as a single user message.

    Returns:
        Dict with token usage, wall-clock latency in seconds, the text of the
        first content block (``""`` if the reply has none) and the stop reason.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    summary = {}
    summary["input_tokens"] = reply.usage.input_tokens
    summary["output_tokens"] = reply.usage.output_tokens
    summary["latency_s"] = round(time.time() - started, 2)
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
def call_local(document_text):
    """Ask the local Ollama model for entity candidates in ``document_text``.

    Returns:
        ``{"response", "latency_s"}`` on success, or ``{"error", "latency_s"}``
        if anything fails. Failures are reported, never raised, so the caller
        can record the document as skipped.
    """
    started = time.time()
    try:
        generation_options = {"num_predict": 1024, "temperature": 0, "num_ctx": 12288}
        request_json = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": generation_options,
        }
        http_resp = requests.post(OLLAMA_URL, json=request_json, timeout=LOCAL_TIMEOUT)
        http_resp.raise_for_status()
        generated = http_resp.json().get("response", "")
        return {"response": generated, "latency_s": round(time.time() - started, 2)}
    except Exception as e:
        return {"error": str(e), "latency_s": round(time.time() - started, 2)}
def parse_graph(raw):
    """Count the entities and edges in a model's JSON graph reply.

    Returns:
        ``(entity_count, edge_count)`` when the reply decodes to an object
        whose ``entities`` and ``edges`` are both lists; ``(None, None)``
        for empty, malformed or differently shaped replies.
    """
    unparseable = (None, None)
    text = strip_json_fences(raw)
    if not text:
        return unparseable
    try:
        graph = json.loads(text)
    except json.JSONDecodeError:
        return unparseable
    if isinstance(graph, dict):
        entities, edges = graph.get("entities"), graph.get("edges")
        if isinstance(entities, list) and isinstance(edges, list):
            return len(entities), len(edges)
    return unparseable
def parse_candidates(raw):
    """Extract the entity-candidate names from the local model's JSON reply.

    Returns:
        A list of stripped, non-empty candidate strings, or ``None`` when the
        reply is empty, is not valid JSON, is not an object, or has no list
        under ``"candidates"``.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    # Filter after stripping: a whitespace-only candidate is truthy, so it
    # would otherwise end up as an empty string in the hint list.
    names = (str(c).strip() for c in cands if c)
    return [name for name in names if name]
def stratify(docs):
    """Pick up to 10 small, 12 medium and 8 large docs, keeping file order.

    Buckets are by ``content_length``: small is under 1000, medium is
    1000-4999, large is 5000 and above. The result lists small docs first,
    then medium, then large.
    """
    buckets = {"small": [], "medium": [], "large": []}
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            buckets["small"].append(doc)
        elif length < 5000:
            buckets["medium"].append(doc)
        else:
            buckets["large"].append(doc)
    return buckets["small"][:10] + buckets["medium"][:12] + buckets["large"][:8]
def _condition_a_record(a, a_ents, a_edges):
    """Build the JSON-serialisable record for one condition-A (baseline) call."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def main():
    """Run baseline (A) against the optimised cascade (B) and save the results.

    For every sampled document condition A always runs. Condition B reuses
    A's result for documents under SKIP_SMALL_THRESHOLD characters; otherwise
    the local model drafts entity candidates and Haiku is called with the
    compressed draft. Totals, per-bucket statistics and all per-document
    records are written to OUTPUT_FILE.

    Exits with status 1 when required environment variables or V2_FILE are
    missing.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE) as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    # Savings figure of the earlier, unoptimised cascade run, used as the reference point.
    prior_cascade_pct = -12.66
    print(f"Sample: {len(sample)} docs (10s/12m/8l, file order)")
    print(f"Skip-small threshold: <{SKIP_SMALL_THRESHOLD} chars")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print()
    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue
            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket by the length actually sent, which may differ from the
            # content_length that stratify() used to pick the sample.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            skip_small_routed = sent_len < SKIP_SMALL_THRESHOLD
            trunc_marker = "*" if truncated else " "
            route_marker = "[skip-small]" if skip_small_routed else "[cascade] "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] "
                  f"{route_marker} {source[:50]}", flush=True)
            # Fields shared by every record written for this document.
            doc_fields = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
            }
            # Condition A — always runs
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges = parse_graph(a["response_text"])
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"ents={a_ents} edges={a_edges} stop={a['stop_reason']} t={a['latency_s']}s",
                      flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_ents = a_edges = None
            # Condition B
            if skip_small_routed:
                # Skip-small: B = A. Same call, no local pass.
                print(" B: routed to baseline (skip-small)", flush=True)
                b = a
                b_ents = a_ents
                b_edges = a_edges
                local_result = {"skipped": "skip_small_routed"}
                local_candidates = []
                local_raw = ""
            else:
                local_result = call_local(doc_text)
                if "error" in local_result:
                    print(f" B local FAILED: {local_result['error']} — recording skip", flush=True)
                    results.append({
                        **doc_fields,
                        "skip_small_routed": False,
                        "condition_a": _condition_a_record(a, a_ents, a_edges),
                        "condition_b": {
                            "skipped": "local_model_failed",
                            "local_error": local_result["error"],
                            "local_latency_s": local_result.get("latency_s"),
                        },
                    })
                    continue
                local_raw = local_result["response"]
                cands = parse_candidates(local_raw)
                local_candidates = cands or []
                print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
                      flush=True)
                if not local_candidates:
                    print(" B local: empty draft — skipping API call", flush=True)
                    results.append({
                        **doc_fields,
                        "skip_small_routed": False,
                        "condition_a": _condition_a_record(a, a_ents, a_edges),
                        "condition_b": {
                            "skipped": "local_draft_empty",
                            "local_latency_s": local_result.get("latency_s"),
                            "local_raw": local_raw[:1000],
                        },
                    })
                    continue
                # Compressed draft format — bare JSON array.
                # str.replace, not str.format: the template has literal braces.
                local_draft_json = json.dumps(local_candidates, ensure_ascii=False)
                b_prompt = CONDITION_B_API_PROMPT_COMPRESSED.replace("{local_draft_json}", local_draft_json) + doc_text
                try:
                    b = call_haiku(client, b_prompt)
                    b_ents, b_edges = parse_graph(b["response_text"])
                    print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                          f"ents={b_ents} edges={b_edges} stop={b['stop_reason']} t={b['latency_s']}s",
                          flush=True)
                except Exception as e:
                    print(f" B api FAILED: {e}", flush=True)
                    b = {"error": str(e)}
                    b_ents = b_edges = None
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                if a_edges and b_edges is not None and a_edges > 0:
                    edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
                print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)
            results.append({
                **doc_fields,
                "skip_small_routed": skip_small_routed,
                "condition_a": _condition_a_record(a, a_ents, a_edges),
                "condition_b": {
                    "skip_small_routed": skip_small_routed,
                    "local_latency_s": local_result.get("latency_s"),
                    "local_candidates": local_candidates,
                    "local_raw": local_raw[:1000],
                    "api_input_tokens": b.get("input_tokens"),
                    "api_output_tokens": b.get("output_tokens"),
                    "api_latency_s": b.get("latency_s"),
                    "entity_count": b_ents,
                    "edge_count": b_edges,
                    "stop_reason": b.get("stop_reason"),
                    "response_text": b.get("response_text", "")[:4000],
                    "error": b.get("error"),
                },
            })
    finally:
        # Close the connection even if a document raises unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)
    # A pair is valid only when both conditions produced token counts.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000
    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
        be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
        skip_count = sum(1 for r in rows if r.get("skip_small_routed"))
        by_bucket[bucket] = {
            "n": len(rows),
            "n_skip_small_routed": skip_count,
            "n_cascade": len(rows) - skip_count,
            "a_input_tokens": ai,
            "a_output_tokens": ao,
            "b_input_tokens": bi,
            "b_output_tokens": bo,
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
            "b_avg_edges": round(statistics.mean(be), 1) if be else None,
        }
    summary = {
        "experiment": "cascade_optimization_test",
        "title": "Cascade Optimization — skip-small + compressed-draft",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "skip_small_threshold": SKIP_SMALL_THRESHOLD,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "prior_unoptimized_cascade_pct": prior_cascade_pct,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)
    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
        opt_delta = delta_pct - prior_cascade_pct
        print(f"Optimization delta vs prior cascade: {opt_delta:+.2f} points "
              f"(prior was {prior_cascade_pct}%)")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}, skip={stats['n_skip_small_routed']}): "
                  f"in {stats['input_delta_pct']:+.1f}% "
                  f"out {stats['output_delta_pct']:+.1f}% "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print("Results: " + str(OUTPUT_FILE))


if __name__ == "__main__":
    main()
+485
View File
@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""
Cascade Test — Nodes-vs-Edges Experiment
Tests whether splitting graph extraction into "local drafts entity candidates,
API verifies + draws edges" reduces total API cost vs single-shot full
extraction, while producing a comparable graph.
Two conditions per document:
A — Baseline: single Claude Haiku call, full extraction
B — Cascade: Mistral lists entity candidates, then Haiku does verify+edges
Both conditions:
- See the full document (parity-respecting)
- Use open entity type vocabulary (no fixed schema)
- Use natural-language predicates (no constrained relations)
- Same target output schema, same temperature
Sample: 20 docs from briefing_test_v2_results.json, stratified by char length.
Reports API cost only. Local Mistral time is recorded but not monetized
(ran on the VPS, no per-token API charge).
Outputs: ~/aaronai/experiments/cascade_test_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# Load the project's .env file into the process environment.
load_dotenv(Path.home() / "aaronai" / ".env")
# Results of the earlier briefing run, used as input to this experiment.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_test_results.json"
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
# Seconds allowed for one local-model request.
LOCAL_TIMEOUT = 120
# Documents are truncated to this many characters when fetched.
MAX_DOC_CHARS = 8000
# Verified pricing 2026-04-28 against Anthropic docs
# (presumably USD per million tokens, as the _PER_M suffix suggests — confirm where used).
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) prompt; it ends with "DOCUMENT:" so the document text can follow.
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local model's entity-candidate pass; the document text is appended.
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
Return ONLY valid JSON:
{
"candidates": [string]
}
Just names. No types, no relationships. JSON only.
DOCUMENT:
"""
# Condition B prompt carrying the local draft in the {local_draft} placeholder.
# NOTE(review): the template contains literal JSON braces, so str.format would
# fail on it — confirm the call site substitutes the placeholder another way.
CONDITION_B_API_PROMPT_WITH_DRAFT = """Extract a knowledge graph from the document below.
A local model has identified entity candidates that may help orient your reading. Treat the candidates as a hint, not as truth — verify each candidate appears in the document, ignore any that do not, and add any entities the candidates missed.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
ENTITY CANDIDATES FROM LOCAL MODEL:
{local_draft}
DOCUMENT:
"""
def strip_json_fences(text):
    """Return *text* with surrounding whitespace and a Markdown code fence removed.

    A leading "```" or "```json" marker and a trailing "```" marker are
    dropped when present. Falsy input (None, "") yields "".
    """
    body = text.strip() if text else ""
    if not body:
        return ""
    # Opening fence first, then the closing one.
    for fence_pattern in (r"^```(?:json)?\s*", r"\s*```$"):
        body = re.sub(fence_pattern, "", body)
    return body.strip()
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its chunks in the ``embeddings`` table.

    Args:
        pg_conn: open DB-API connection (main() passes a psycopg2 connection).
        source: value matched against the ``source`` column.

    Returns:
        ``(text, original_length)``. ``text`` is the chunks joined with blank
        lines in ``id`` order and cut to MAX_DOC_CHARS; ``original_length`` is
        the length before the cut. ``(None, 0)`` when no usable rows exist.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    # NULL chunks would make str.join() raise; skip them.
    chunks = [row[0] for row in rows if row[0] is not None]
    if not chunks:
        return None, 0
    full = "\n\n".join(chunks)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarise the reply.

    Uses the module-level HAIKU_MODEL, HAIKU_MAX_TOKENS and HAIKU_TEMPERATURE.

    Returns a dict with ``input_tokens``, ``output_tokens``, ``latency_s``
    (wall-clock seconds, rounded to 2 decimals), ``response_text`` (text of
    the first content block, or "" when the reply has no content) and
    ``stop_reason``. Exceptions from the client propagate to the caller.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
def call_local(document_text):
    """Ask the local Ollama model for entity candidates in *document_text*.

    Posts LOCAL_PROMPT + document to OLLAMA_URL (non-streaming, JSON format,
    temperature 0) with a LOCAL_TIMEOUT-second timeout.

    Returns ``{"response": str, "latency_s": float}`` on success, or
    ``{"error": str, "latency_s": float}`` on any failure. Never raises.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 8192},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as exc:
        # Best effort by design: the caller records the error and moves on.
        return {"error": str(exc), "latency_s": elapsed()}
def parse_graph(raw):
    """Count entities and edges in a model's JSON knowledge-graph reply.

    Returns ``(n_entities, n_edges)`` when *raw* (after fence stripping) is a
    JSON object whose "entities" and "edges" values are both lists; otherwise
    ``(None, None)``.
    """
    payload = strip_json_fences(raw)
    if payload:
        try:
            graph = json.loads(payload)
        except json.JSONDecodeError:
            graph = None
        if isinstance(graph, dict):
            entity_list = graph.get("entities")
            edge_list = graph.get("edges")
            if isinstance(entity_list, list) and isinstance(edge_list, list):
                return len(entity_list), len(edge_list)
    return None, None
def parse_candidates(raw):
    """Extract the candidate-name list from the local model's JSON reply.

    Returns a list of stripped, non-empty candidate strings when *raw* (after
    fence stripping) is a JSON object with a list under "candidates";
    otherwise ``None``. Falsy items are dropped, and so are items that are
    empty after stripping — a whitespace-only candidate would otherwise become
    an empty "- " bullet in the Condition B prompt.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    stripped = (str(c).strip() for c in cands if c)
    return [name for name in stripped if name]
def stratify(docs):
    """Select up to 5 small, 10 medium and 5 large documents, keeping input order.

    Buckets are defined on ``doc["content_length"]``: small < 1000,
    medium 1000-4999, large >= 5000. The result lists the small picks first,
    then medium, then large.
    """
    small, medium, large = [], [], []
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            small.append(doc)
        elif length < 5000:
            medium.append(doc)
        else:
            large.append(doc)
    return small[:5] + medium[:10] + large[:5]
def _condition_a_record(a, a_ents, a_edges):
    """Build the Condition A section of a per-document result entry."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _fmt_pct(value):
    """Format a percentage delta for the console; ``None`` renders as 'n/a'."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the cascade experiment and write the JSON summary to OUTPUT_FILE.

    For every sampled document:
      * Condition A sends the document to Haiku on its own.
      * Condition B asks the local model for entity candidates, then sends the
        document plus those candidates to Haiku. The API call is skipped when
        the local model fails or returns no candidates.

    Token usage, entity/edge counts and API cost are aggregated overall and
    per size bucket, then printed and saved. Exits with status 1 when
    ANTHROPIC_API_KEY or PG_DSN is unset, or V2_FILE does not exist.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE) as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (stratified by char length, file order)")
    for d in sample:
        print(f" [{d['content_length']:>6}c] {d['source'][:60]}")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print(f"Local model: {LOCAL_MODEL}")
    print()
    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:60]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue
            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket by the text actually sent, i.e. after truncation.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)
            # Fields shared by every result entry for this document.
            base = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
            }
            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges = parse_graph(a["response_text"])
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"ents={a_ents} edges={a_edges} stop={a['stop_reason']} t={a['latency_s']}s",
                      flush=True)
            except Exception as e:
                # A failed Condition A is recorded; Condition B still runs.
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_ents = a_edges = None
            a_record = _condition_a_record(a, a_ents, a_edges)
            # Condition B local pass
            local_result = call_local(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']} — skipping doc", flush=True)
                results.append({
                    **base,
                    "condition_a": a_record,
                    "condition_b": {
                        "skipped": "local_model_failed",
                        "local_error": local_result["error"],
                        "local_latency_s": local_result.get("latency_s"),
                    },
                })
                continue
            local_raw = local_result["response"]
            cands = parse_candidates(local_raw)
            local_candidates = cands or []
            print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
                  flush=True)
            if not local_candidates:
                print(f" B local: empty draft — skipping API call to avoid asymmetric test", flush=True)
                results.append({
                    **base,
                    "condition_a": a_record,
                    "condition_b": {
                        "skipped": "local_draft_empty",
                        "local_latency_s": local_result.get("latency_s"),
                        "local_raw": local_raw[:1000],
                    },
                })
                continue
            local_draft_str = "\n".join(f"- {c}" for c in local_candidates)
            # str.replace, not str.format: the prompt contains literal JSON braces.
            b_prompt = CONDITION_B_API_PROMPT_WITH_DRAFT.replace("{local_draft}", local_draft_str) + doc_text
            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges = parse_graph(b["response_text"])
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"ents={b_ents} edges={b_edges} stop={b['stop_reason']} t={b['latency_s']}s",
                      flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_ents = b_edges = None
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                if a_edges and b_edges is not None and a_edges > 0:
                    edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
                print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)
            results.append({
                **base,
                "condition_a": a_record,
                "condition_b": {
                    "local_latency_s": local_result.get("latency_s"),
                    "local_candidates": local_candidates,
                    "local_raw": local_raw[:1000],
                    "api_input_tokens": b.get("input_tokens"),
                    "api_output_tokens": b.get("output_tokens"),
                    "api_latency_s": b.get("latency_s"),
                    "entity_count": b_ents,
                    "edge_count": b_edges,
                    "stop_reason": b.get("stop_reason"),
                    "response_text": b.get("response_text", "")[:4000],
                    "error": b.get("error"),
                },
            })
    finally:
        # Close the connection even if an unexpected error aborts the run.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)
    # Only documents where both API calls reported token usage enter the totals.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000
    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
        be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
        by_bucket[bucket] = {
            "n": len(rows),
            "a_input_tokens": ai,
            "a_output_tokens": ao,
            "b_input_tokens": bi,
            "b_output_tokens": bo,
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
            "b_avg_edges": round(statistics.mean(be), 1) if be else None,
        }
    summary = {
        "experiment": "cascade_test",
        "title": "Nodes-vs-Edges Cascade Experiment",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)
    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # The deltas are None when a token total is zero; _fmt_pct handles that.
            print(f" {bucket:6s} (n={stats['n']}): "
                  f"in {_fmt_pct(stats['input_delta_pct'])} "
                  f"out {_fmt_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print(f"NOTE: API cost only. Local Mistral runtime is not monetized.")
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
+230
View File
@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Experiment 003 — Entity-Only Consistency Test
Three Mistral passes per document, measure consistency on entity fields only
(people, organizations, locations, dates). Excludes document_type label.
DISTINCT ON (source) sampling — fixes Exp 001 chunk-replacement flaw.
Outputs: ~/aaronai/experiments/consistency_test_v2_results.json
"""
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import psycopg2
import requests
from dotenv import load_dotenv
# Load environment variables from the project's .env file; main() reads PG_DSN
# from the environment.
load_dotenv(Path.home() / "aaronai" / ".env")
# JSON summary written by main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "consistency_test_v2_results.json"
# Local Ollama generate endpoint and the model requested from it.
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL = "mistral"
N_PASSES = 3  # extraction passes run per document
N_DOCS = 50  # LIMIT used when fetching sources; short documents are filtered out afterwards
PER_CALL_TIMEOUT = 60 # seconds — fail fast, don't wedge
MAX_DOC_CHARS = 8000 # cap document length sent to Mistral
# call_mistral() appends the (truncated) document text directly after this prompt.
EXTRACTION_PROMPT = """Extract entities from the document below. Return ONLY valid JSON with this exact schema:
{
"people": [string],
"organizations": [string],
"locations": [string],
"dates": [string]
}
Rules:
- Only include entities you are CERTAIN about. If uncertain, omit.
- No prose, no markdown fences, no commentary. JSON only.
- Empty arrays are valid.
DOCUMENT:
"""
def call_mistral(document_text):
    """Run one entity-extraction pass over *document_text* on the local model.

    The document is cut to MAX_DOC_CHARS before being appended to
    EXTRACTION_PROMPT.

    Returns ``{"response", "latency_s", "truncated"}`` on success. On failure
    returns ``{"error", "latency_s"}``; for a timeout ``latency_s`` is reported
    as PER_CALL_TIMEOUT rather than measured. Never raises for request errors.
    """
    clipped = document_text[:MAX_DOC_CHARS]
    started = time.time()
    try:
        request_body = {
            "model": MODEL,
            "prompt": EXTRACTION_PROMPT + clipped,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 512},
        }
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=PER_CALL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        outcome = {
            "response": generated,
            "latency_s": round(time.time() - started, 2),
            "truncated": len(document_text) > MAX_DOC_CHARS,
        }
    except requests.exceptions.Timeout:
        outcome = {"error": f"timeout after {PER_CALL_TIMEOUT}s", "latency_s": PER_CALL_TIMEOUT}
    except Exception as exc:
        outcome = {"error": str(exc), "latency_s": round(time.time() - started, 2)}
    return outcome
def parse_entities(raw_response):
    """Parse and normalise the model's entity JSON.

    Strips an optional Markdown code fence, then expects a JSON object. Each
    of "people", "organizations", "locations" and "dates" must be a list when
    present; a missing key counts as an empty list.

    Returns a dict mapping each of those four keys to a sorted list of unique,
    stripped, lower-cased strings. Returns ``None`` when the text is not valid
    JSON, is not a JSON object, or any field is not a list.
    """
    text = (raw_response or "").strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not an object (list, number, ...) has no .get();
    # treat it as a parse failure instead of crashing the run.
    if not isinstance(data, dict):
        return None
    out = {}
    for key in ("people", "organizations", "locations", "dates"):
        vals = data.get(key, [])
        if not isinstance(vals, list):
            return None
        # Lower-case and de-duplicate so passes compare on content, not casing/order.
        out[key] = sorted(set(str(v).strip().lower() for v in vals if v))
    return out
def entities_match(a, b):
    """Return True when two parsed entity dicts agree on all four entity fields.

    A ``None`` argument (a pass that failed to parse) never matches.
    """
    if a is None or b is None:
        return False
    for field in ("people", "organizations", "locations", "dates"):
        if not a[field] == b[field]:
            return False
    return True
def fetch_distinct_sources(pg_conn, n):
    """Fetch up to *n* whole documents, one per distinct ``source``.

    Chunks of each source are concatenated in ``id`` order (blank-line
    separated) by the database. Sources are ordered by their smallest ``id``.

    Returns a list of ``(source, document)`` tuples. Documents that are NULL
    or have 50 or fewer non-whitespace-trimmed characters are dropped *after*
    the LIMIT is applied, so fewer than *n* entries may come back.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute("""
            SELECT source, string_agg(document, E'\n\n' ORDER BY id) AS doc
            FROM embeddings
            WHERE source IS NOT NULL
            GROUP BY source
            ORDER BY MIN(id)
            LIMIT %s
        """, (n,))
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    return [(s, d) for s, d in rows if d and len(d.strip()) > 50]
def main():
    """Run Experiment 003 and write the summary JSON to OUTPUT_FILE.

    Each fetched document gets N_PASSES extraction passes. A document is
    "consistent" when every pass parsed and all passes produced identical
    entity lists in all four fields. Per-field and overall consistency rates
    are printed and saved. Exits with status 1 when PG_DSN is not set.
    """
    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    pg_conn = psycopg2.connect(pg_dsn)
    try:
        docs = fetch_distinct_sources(pg_conn, N_DOCS)
    finally:
        # The connection is only needed for the initial fetch.
        pg_conn.close()
    print(f"Loaded {len(docs)} distinct sources from pgvector")
    print(f"Model: {MODEL} | Passes per doc: {N_PASSES}")
    print(f"Per-call timeout: {PER_CALL_TIMEOUT}s | Max doc chars: {MAX_DOC_CHARS}")
    print(f"Calls planned: {len(docs) * N_PASSES}\n")
    fields = ("people", "organizations", "locations", "dates")
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    for i, (source, doc_text) in enumerate(docs, 1):
        size_marker = f"[{len(doc_text):>5}c]"
        print(f"[{i:02d}/{len(docs)}] {size_marker} {source[:55]}", flush=True)
        passes = []
        for pass_no in range(N_PASSES):
            r = call_mistral(doc_text)
            if "error" in r:
                print(f" pass {pass_no+1}: {r['error']}", flush=True)
                passes.append({"error": r["error"], "parsed_ok": False, "latency_s": r["latency_s"]})
            else:
                entities = parse_entities(r["response"])
                passes.append({
                    "raw": r["response"][:500],
                    "entities": entities,
                    "latency_s": r["latency_s"],
                    "parsed_ok": entities is not None,
                    "truncated_input": r.get("truncated", False),
                })
        all_parsed = bool(passes) and all(p.get("parsed_ok") for p in passes)
        if all_parsed:
            # Compare every pass with the first. List equality is transitive,
            # so this matches chained pairwise comparison and works for any
            # N_PASSES, not only 3.
            first = passes[0]["entities"]
            others = [p["entities"] for p in passes[1:]]
            consistent = all(entities_match(first, e) for e in others)
            per_field = {
                k: all(e[k] == first[k] for e in others)
                for k in fields
            }
        else:
            consistent = False
            per_field = None
        latencies = [p.get("latency_s", 0) for p in passes]
        print(f" parsed={all_parsed} consistent={consistent} latencies={latencies}", flush=True)
        results.append({
            "source": source,
            "doc_chars": len(doc_text),
            "passes": passes,
            "all_parsed": all_parsed,
            "consistent": consistent,
            "per_field_consistency": per_field,
        })
    total_elapsed = round(time.time() - t_total, 1)
    parsed = [r for r in results if r["all_parsed"]]
    consistent = [r for r in parsed if r["consistent"]]
    field_rates = {k: 0 for k in fields}
    for r in parsed:
        for k, v in (r["per_field_consistency"] or {}).items():
            if v:
                field_rates[k] += 1
    # Per-field rates are relative to documents whose passes all parsed.
    field_rates_pct = {
        k: round(100 * v / len(parsed), 1) if parsed else 0.0
        for k, v in field_rates.items()
    }
    summary = {
        "experiment": "003",
        "title": "Entity-Only Consistency Test",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "n_passes": N_PASSES,
        "per_call_timeout_s": PER_CALL_TIMEOUT,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(docs),
        "n_all_parsed": len(parsed),
        "n_fully_consistent": len(consistent),
        "consistency_rate_pct": round(100 * len(consistent) / len(docs), 2) if docs else 0.0,
        "consistency_rate_among_parsed_pct": (
            round(100 * len(consistent) / len(parsed), 2) if parsed else 0.0
        ),
        "per_field_consistency_pct": field_rates_pct,
        "total_elapsed_s": total_elapsed,
        "exp_001_baseline_pct": 18.0,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)
    print()
    print("=" * 60)
    print(f"DONE — {len(docs)} docs in {total_elapsed}s")
    print(f"All {N_PASSES} passes parsed cleanly: {len(parsed)}/{len(docs)}")
    print(f"Fully consistent (all 4 fields match): {len(consistent)}/{len(docs)} ({summary['consistency_rate_pct']}%)")
    print(f"Among parsed only: {summary['consistency_rate_among_parsed_pct']}%")
    print(f"Per-field consistency: {field_rates_pct}")
    print(f"Exp 001 baseline: 18% | delta: {summary['consistency_rate_pct'] - 18.0:+.2f} pts")
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
+551
View File
@@ -0,0 +1,551 @@
"""
Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
Reads entities from FalkorDB group_id 'aaron', infers light type labels,
computes pairwise similarity within type blocks using ego summary embedding +
name string distance + neighbor pattern overlap, generates merge proposals
above threshold, writes proposal log for human review.
Does NOT execute merges. 0.1 is the calibration phase — proposals only,
human reviews before any action.
"""
import json
import re
import os
import time
from datetime import datetime, timezone
from collections import defaultdict
from pathlib import Path
import requests
from falkordb import FalkorDB
import numpy as np
# Configuration
GROUP_ID = "aaron"  # name of the FalkorDB graph that generate_proposals() selects
HIGH_CONFIDENCE_THRESHOLD = 0.85 # confidence >= this becomes a merge proposal
LOW_CONFIDENCE_THRESHOLD = 0.65 # confidence in [this, HIGH) is logged as low-confidence; lower scores are dropped
PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
# NOTE: this runs at import time, so importing the module creates the directory.
PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors *a* and *b* as a float.

    Inputs are converted to float32 arrays. A zero-length vector on either
    side gives 0.0 instead of dividing by zero.
    """
    vectors = [np.array(v, dtype=np.float32) for v in (a, b)]
    norms = [np.linalg.norm(v) for v in vectors]
    if any(norm == 0 for norm in norms):
        return 0.0
    left, right = vectors
    return float(np.dot(left, right) / (norms[0] * norms[1]))
def name_similarity(name_a, name_b):
    """
    Score how alike two entity names are, from 0.0 to 1.0.

    Checks run in this order and the first hit wins:
      1. Case-insensitive exact match                     -> 1.0
      2. Either name has no word tokens                   -> 0.0
      3. One lower-cased name is a substring of the other -> 0.7 to 0.9,
         scaled by the length ratio ("Aaron" / "Aaron Nelson")
      4. One name matches the other's initials            -> 0.85
      5. Otherwise the Jaccard overlap of the word tokens
         ("Aaron Nelson" / "Nelson, Aaron")
    """
    def initials_match(short, full):
        # True when `short` equals, or occurs inside, the upper-cased first
        # letters of the whitespace-separated words of `full`.
        if len(short) >= len(full):
            return False
        words = full.split()
        if len(words) < 2:
            return False
        candidate = short if short.isupper() else short.upper()
        initials = ''.join(word[0].upper() for word in words)
        return candidate == initials or candidate in initials

    left = name_a.lower().strip()
    right = name_b.lower().strip()
    if left == right:
        return 1.0

    left_tokens = set(re.findall(r'\b\w+\b', left))
    right_tokens = set(re.findall(r'\b\w+\b', right))
    if not (left_tokens and right_tokens):
        return 0.0

    # Containment is a strong signal, discounted by the length difference.
    if left in right or right in left:
        shorter, longer = sorted((len(left), len(right)))
        return 0.7 + 0.2 * (shorter / longer)

    # The acronym test uses the names as given, not the lower-cased copies.
    if initials_match(name_a, name_b) or initials_match(name_b, name_a):
        return 0.85

    return len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
def infer_type(entity_name, summary):
"""
Light type inference for blocking. Heuristic-based, transparent.
Returns one of: person, organization, project, place, concept, unknown.
NOT a precise classification — just enough to avoid obviously wrong
cross-type comparisons (person vs project). When in doubt, return
'unknown' which gets compared against everything.
"""
name_lower = entity_name.lower().strip()
summary_lower = (summary or "").lower()
# Person: name patterns
person_indicators = [
# First+Last name pattern (two title-cased words, no other tokens)
bool(re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', entity_name.strip())),
# Single name that's also in the summary as a person
any(phrase in summary_lower for phrase in [
'is a person', 'is a professor', 'is an artist', 'is a colleague',
'is a friend', 'is a family member', 'works at', 'studied at',
"'s spouse", "'s child", "'s parent", "'s student",
]),
]
if any(person_indicators):
return "person"
# Organization: company/institution indicators
org_indicators = [
any(suffix in name_lower for suffix in [
' inc', ' llc', ' corp', ' company', ' university', ' college',
' school', ' institute', ' foundation', ' department',
]),
any(phrase in summary_lower for phrase in [
'is a company', 'is a university', 'is an organization',
'is an institution', 'is a department', 'is a nonprofit',
]),
]
if any(org_indicators):
return "organization"
# Project: software/creative work indicators
project_indicators = [
any(phrase in summary_lower for phrase in [
'is a project', 'software project', 'is a codebase',
'is a tool', 'is a system', 'is an application',
'is a research project', 'is a design project',
]),
any(suffix in name_lower for suffix in [' project', ' system', ' platform']),
]
if any(project_indicators):
return "project"
# Place: location indicators
place_indicators = [
any(phrase in summary_lower for phrase in [
'is a city', 'is a town', 'is a state', 'is a country',
'is a neighborhood', 'is a region', 'is a location',
]),
]
if any(place_indicators):
return "place"
# Default
return "unknown"
def get_neighbors(graph, entity_uuid, limit=20):
    """Return the set of names of entities one RELATES_TO hop from an entity.

    The query matches the relationship in either direction and returns at
    most *limit* distinct names. Empty or NULL names are left out.
    """
    cypher = """
        MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
        RETURN DISTINCT other.name AS name
        LIMIT $limit
    """
    params = {"uuid": entity_uuid, "limit": limit}
    rows = graph.query(cypher, params).result_set
    return {row[0] for row in rows if row[0]}
def neighbor_jaccard(neighbors_a, neighbors_b):
    """
    Containment overlap of two neighbor sets (not Jaccard, despite the name).

    Returns |A ∩ B| / min(|A|, |B|): the share of the smaller set's neighbors
    that the larger set also has. Returns 0.0 when either set is empty.

    Containment is used instead of Jaccard so that a small alias entity whose
    neighbors are a subset of a large hub entity's neighbors still scores
    high. Jaccard would penalise the size difference itself.
    """
    if not (neighbors_a or neighbors_b):
        return 0.0
    shared = neighbors_a & neighbors_b
    smaller_size = min(len(neighbors_a), len(neighbors_b))
    return len(shared) / smaller_size if smaller_size else 0.0
def get_edge_count(graph, entity_uuid):
    """Return how many RELATES_TO relationships touch the entity, in either direction.

    Returns 0 when the query yields no rows.
    """
    cypher = """
        MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
        RETURN count(r) AS c
    """
    rows = graph.query(cypher, {"uuid": entity_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
def combine_signals(name_sim, ego_sim, neighbor_sim):
    """
    Merge name, ego (embedding) and neighbor-overlap similarity into one score.

    The rules are checked in order and the first that applies decides:
      1. neighbor >= 0.7 and ego >= 0.3: containment-led blend
         (0.4 neighbor + 0.4 ego + 0.2 name). This is the asymmetric alias
         case, where names differ but the neighborhoods nest.
      2. ego < 0.3 and neighbor < 0.4: capped at 0.4, probably not aliases.
      3. name >= 0.85 and ego >= 0.5: name-led blend
         (0.4 ego + 0.4 name + 0.2 neighbor).
      4. Otherwise the default blend (0.45 ego + 0.3 neighbor + 0.25 name).
    """
    nested_neighborhood = neighbor_sim >= 0.7 and ego_sim >= 0.3
    if nested_neighborhood:
        return 0.4 * neighbor_sim + 0.4 * ego_sim + 0.2 * name_sim

    weak_everywhere = ego_sim < 0.3 and neighbor_sim < 0.4
    if weak_everywhere:
        strongest = max(ego_sim, neighbor_sim)
        return min(0.4, strongest)

    near_identical_name = name_sim >= 0.85 and ego_sim >= 0.5
    if near_identical_name:
        return 0.4 * ego_sim + 0.4 * name_sim + 0.2 * neighbor_sim

    return 0.45 * ego_sim + 0.3 * neighbor_sim + 0.25 * name_sim
def compute_summary_embedding(text, model="nomic-embed-text"):
    """
    Embed an entity summary through the local Ollama embeddings endpoint.

    Summary embeddings supply the ego-similarity signal: two entities with
    different names can still have summaries that describe the same thing.

    Only the first 2000 characters of *text* are sent. Returns the
    "embedding" value from the response, or None when *text* is empty or
    shorter than 10 characters, or when the request fails. Failures are
    printed, not raised.
    """
    too_short = not text or len(text) < 10
    if too_short:
        return None
    try:
        reply = requests.post(
            "http://localhost:11434/api/embeddings",
            json={"model": model, "prompt": text[:2000]},
            timeout=30,
        )
        reply.raise_for_status()
        embedding = reply.json().get("embedding")
    except Exception as exc:
        print(f" Embedding error: {exc}")
        embedding = None
    return embedding
def precompute_summary_embeddings(entities, model="nomic-embed-text"):
    """Attach a ``summary_embedding`` to every entity dict, using an on-disk cache.

    Args:
        entities: list of dicts with at least ``uuid`` and ``summary``; each
            dict is mutated in place (``summary_embedding`` is set to the
            embedding, or None when none could be computed).
        model: Ollama embedding model name passed to compute_summary_embedding().

    The cache is a JSON file keyed by entity uuid. It is saved after every
    100 newly computed embeddings and once more at the end.
    NOTE: the cache is keyed by uuid only, so a changed summary does not
    refresh a cached embedding.
    """
    print(f"Computing summary embeddings via Ollama ({model})...")
    print(f" Total entities: {len(entities)}")
    cache_path = Path("/home/aaron/aaronai/experiments/summary_embeddings_cache.json")
    cache = {}
    if cache_path.exists():
        with open(cache_path) as f:
            cache = json.load(f)
        print(f" Loaded {len(cache)} cached embeddings")
    new_count = 0
    start = time.time()
    for i, e in enumerate(entities):
        if e["uuid"] in cache:
            e["summary_embedding"] = cache[e["uuid"]]
            continue
        emb = compute_summary_embedding(e["summary"], model=model)
        if not emb:
            e["summary_embedding"] = None
            continue
        e["summary_embedding"] = emb
        cache[e["uuid"]] = emb
        new_count += 1
        # Checkpoint only right after a new embedding is added. Checking on
        # every iteration would rewrite the file for each failed embedding
        # while new_count sits on a multiple of 100.
        if new_count % 100 == 0:
            with open(cache_path, "w") as f:
                json.dump(cache, f)
            elapsed = time.time() - start
            rate = new_count / elapsed if elapsed > 0 else 0
            remaining = (len(entities) - i - 1) / rate if rate > 0 else 0
            print(f" ... {i+1}/{len(entities)} (computed {new_count} new, ~{remaining:.0f}s remaining)")
    # Final save
    with open(cache_path, "w") as f:
        json.dump(cache, f)
    have_embeddings = sum(1 for e in entities if e.get("summary_embedding"))
    print(f" Done. {have_embeddings}/{len(entities)} entities have summary embeddings")
def generate_proposals():
    """Score candidate alias pairs in the GROUP_ID graph and collect merge proposals.

    Steps:
      1. Load every Entity that has both a name embedding and a summary.
      2. Compute or load summary embeddings and infer a coarse type per entity.
      3. Build candidate pairs: same-type pairs, unknown-vs-unknown pairs, and
         unknown-vs-typed pairs.
      4. Score each pair from name similarity, ego (embedding) similarity and
         neighbor containment, after a cheap pre-filter.

    Returns:
        ``(proposals, low_confidence, n_entities, n_pairs)``. ``proposals``
        holds records with confidence >= HIGH_CONFIDENCE_THRESHOLD and
        ``low_confidence`` those in [LOW, HIGH). ``n_pairs`` counts candidate
        pairs before the pre-filter. Nothing is merged.
    """
    from itertools import combinations

    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)
    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
        MATCH (n:Entity)
        WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
        RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)
    entities = [
        {
            'uuid': row[0],
            'name': row[1],
            'summary': row[2],
            'embedding': row[3],
        }
        for row in result.result_set
    ]
    print(f" Loaded {len(entities)} entities with embeddings")
    # Compute summary embeddings (true ego signal, beyond name embeddings)
    precompute_summary_embeddings(entities)
    # Infer types for blocking
    print("Inferring entity types for blocking...")
    type_counts = defaultdict(int)
    blocks = defaultdict(list)
    for e in entities:
        e['inferred_type'] = infer_type(e['name'], e['summary'])
        type_counts[e['inferred_type']] += 1
        blocks[e['inferred_type']].append(e)
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {c}")
    # 'unknown' entities get compared against everything (they might be any type)
    # Other types only get compared within their type block + against unknowns
    print()
    print("Comparing entities within type blocks...")
    proposals = []
    low_confidence = []
    # Build comparison pairs
    pairs_to_compare = []
    typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'}
    unknown_block = blocks.get('unknown', [])
    # Within-type pairs (excluding unknown)
    for ents in typed_blocks.values():
        pairs_to_compare.extend(combinations(ents, 2))
    # Unknown vs unknown
    pairs_to_compare.extend(combinations(unknown_block, 2))
    # Unknown vs typed (unknowns might be any type)
    for ent_unknown in unknown_block:
        for ents in typed_blocks.values():
            for ent_typed in ents:
                pairs_to_compare.append((ent_unknown, ent_typed))
    print(f" Pairs to compare: {len(pairs_to_compare):,}")
    # Per-entity graph lookups are cached: an entity appears in many pairs.
    cache_neighbors = {}
    cache_edge_counts = {}

    def neighbors_cached(uuid):
        if uuid not in cache_neighbors:
            cache_neighbors[uuid] = get_neighbors(graph, uuid)
        return cache_neighbors[uuid]

    def edge_count_cached(uuid):
        if uuid not in cache_edge_counts:
            cache_edge_counts[uuid] = get_edge_count(graph, uuid)
        return cache_edge_counts[uuid]

    def describe(ent):
        # Entity section of a proposal record.
        return {
            'uuid': ent['uuid'],
            'name': ent['name'],
            'type': ent['inferred_type'],
            'summary': ent['summary'][:200],
            'edge_count': edge_count_cached(ent['uuid']),
        }

    for comparisons_done, (ent_a, ent_b) in enumerate(pairs_to_compare, 1):
        if comparisons_done % 5000 == 0:
            print(f" ... {comparisons_done:,} / {len(pairs_to_compare):,}")
        # Compute name similarity (handles formal/informal pairs, acronyms)
        name_sim = name_similarity(ent_a['name'], ent_b['name'])
        # Ego similarity prefers SUMMARY embeddings (the semantic content) and
        # falls back to name embeddings when either summary embedding is missing.
        if ent_a.get('summary_embedding') and ent_b.get('summary_embedding'):
            ego_sim_quick = cosine_similarity(ent_a['summary_embedding'], ent_b['summary_embedding'])
        else:
            ego_sim_quick = cosine_similarity(ent_a['embedding'], ent_b['embedding'])
        # Pre-filter to avoid the neighbor query on obviously different pairs.
        # The thresholds are low so that aliases with different surface names
        # can still reach the neighbor-containment check.
        if ego_sim_quick < 0.3 and name_sim < 0.15:
            continue
        # Full comparison
        neighbors_a = neighbors_cached(ent_a['uuid'])
        neighbors_b = neighbors_cached(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)
        confidence = combine_signals(name_sim, ego_sim_quick, neighbor_sim)
        # Pairs below the low threshold are discarded. Skip them before
        # building the record, which costs two edge-count graph queries.
        if not confidence >= LOW_CONFIDENCE_THRESHOLD:
            continue
        record = {
            'entity_a': describe(ent_a),
            'entity_b': describe(ent_b),
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim_quick, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
        }
        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            proposals.append(record)
        elif confidence >= LOW_CONFIDENCE_THRESHOLD:
            low_confidence.append(record)
    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), len(pairs_to_compare)
def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    """Write one run's merge proposals as a review markdown file plus raw JSON.

    Both files are created in PROPOSALS_DIR and named ``proposals-<timestamp>``
    from the current UTC time at minute resolution, so two runs inside the same
    minute write to the same paths.

    Args:
        proposals: High-confidence records; each gets a full review section
            with an empty ``[ ]`` decision marker.
        low_confidence: Lower-confidence records; only the 30 highest are
            listed in the markdown, all of them are stored in the JSON.
        total_entities: Entity count reported in the statistics section.
        total_comparisons: Pairwise-comparison count reported in the statistics.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    # Make sure the target directory exists before writing either file.
    PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"
    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])

    lines = [
        f"# Consolidator 0.1 — Run {timestamp}",
        "",
        "## Statistics",
        f"- Entities scanned: {total_entities:,}",
        f"- Pairwise comparisons: {total_comparisons:,}",
        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
        "",
        "## How to review",
        "",
        "For each proposal, mark your decision by changing `[ ]` to one of:",
        "- `[APPROVE]` — execute this merge on next run",
        "- `[REJECT]` — don't merge, don't propose again",
        "- `[DEFER]` — re-surface in next run for further consideration",
        "",
        "Save the file when done. Do not modify proposal_id or uuid fields.",
        "",
        "---",
        "",
        f"## Proposed Merges (n={len(proposals)})",
        "",
    ]

    for i, p in enumerate(proposals_sorted, start=1):
        ent_a = p['entity_a']
        ent_b = p['entity_b']
        signals = p['signals']
        lines.extend([
            f"### Proposal {i}",
            "",
            "**Decision:** [ ]",
            "",
            f"**Confidence:** {p['confidence']}",
            "",
            f"**Entity A:** \"{ent_a['name']}\" (type: {ent_a['type']}, {ent_a['edge_count']} edges)",
            f" - uuid: `{ent_a['uuid']}`",
            f" - summary: {ent_a['summary']}",
            "",
            f"**Entity B:** \"{ent_b['name']}\" (type: {ent_b['type']}, {ent_b['edge_count']} edges)",
            f" - uuid: `{ent_b['uuid']}`",
            f" - summary: {ent_b['summary']}",
            "",
            "**Signals:**",
            f" - Name similarity: {signals['name_similarity']}",
            f" - Ego (summary) similarity: {signals['ego_similarity']}",
            f" - Neighbor overlap: {signals['neighbor_overlap']}",
        ])
        if p['shared_neighbors']:
            shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
            lines.append(f" - Shared neighbors (sample): {shared_str}")
        lines.extend(["", "**Optional rejection note:** ", "", "---", ""])

    lines.append("")
    lines.append(f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)")
    lines.append("")
    for p in low_sorted[:30]:
        lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})")
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")

    # The report contains non-ASCII characters ("—", "≥"); pin UTF-8 so the
    # write does not depend on the platform's default locale encoding.
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nProposal log written to: {out_path}")

    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w', encoding="utf-8") as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")
def main():
    """Run one calibration pass: generate proposals, then write the review log."""
    rule = "=" * 70
    print(rule)
    print("Consolidator 0.1 — Calibration Phase")
    print(rule)
    print()
    run_output = generate_proposals()
    proposals, low_confidence, total_entities, total_comparisons = run_output
    write_proposals_log(proposals, low_confidence, total_entities, total_comparisons)
    print()
    # Tell the reviewer what the expected follow-up is.
    print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER")
    print("for each proposal. Re-run will read decisions and execute approved merges.")
if __name__ == "__main__":
main()
+179
View File
@@ -0,0 +1,179 @@
"""
Measure actual Graphiti BULK episode cost on a stratified sample.
Uses /episodes/bulk endpoint. Submits in small batches to avoid rate limits.
"""
import json, os, random, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
SAMPLE_SIZE = 50
BATCH_SIZE = 5
RANDOM_SEED = 42
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
OUT.parent.mkdir(parents=True, exist_ok=True)
def fetch_stratified_sample():
    """Return a seeded, length-stratified sample of (source, full_doc) pairs.

    Every source's chunks are concatenated in id order, then sources are split
    by character length (<1000, 1000-4999, >=5000) and up to 15 / 25 / 10 are
    drawn from the respective pools using RANDOM_SEED.
    """
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("""
        SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
        FROM embeddings
        GROUP BY source
    """)
    # Drop sources whose aggregated document is empty or NULL.
    sources = [(name, text) for name, text in cur.fetchall() if text]
    cur.close()
    conn.close()

    random.seed(RANDOM_SEED)

    short, medium, long_ = [], [], []
    for pair in sources:
        size = len(pair[1])
        if size < 1000:
            short.append(pair)
        elif size < 5000:
            medium.append(pair)
        else:
            long_.append(pair)
    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")

    # Draw order (short, medium, long) is kept so the seeded result is stable.
    sample = []
    for pool, quota in ((short, 15), (medium, 25), (long_, 10)):
        sample.extend(random.sample(pool, min(quota, len(pool))))
    print(f"Sample: {len(sample)} sources, batch_size={BATCH_SIZE}")
    return sample
def submit_bulk_batch(batch):
    """POST one batch of (source, doc) pairs to /episodes/bulk and time it.

    Each document is truncated to 12000 characters. Returns a result dict;
    ``error`` is None on an OK response, the first 500 characters of the
    response body on a non-OK response, or the exception text if the call raised.
    """
    episodes = []
    for source, doc in batch:
        episodes.append({
            "name": source,
            "content": doc[:12000],
            "source_description": "pgvector_migration_bulk_test",
            "timestamp": "2026-04-28T00:00:00",
        })
    payload = {"episodes": episodes}
    source_names = [source for source, _ in batch]

    started = time.time()
    try:
        resp = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
        elapsed = time.time() - started
        outcome = {
            "batch_size": len(batch),
            "status_code": resp.status_code,
            "elapsed_s": round(elapsed, 2),
            "elapsed_per_episode_s": round(elapsed / len(batch), 2),
            "response": resp.json() if resp.ok else None,
            "error": None if resp.ok else resp.text[:500],
            "sources": source_names,
        }
        return outcome
    except Exception as exc:
        # Any failure is recorded rather than raised so the run can continue.
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "elapsed_per_episode_s": None,
            "response": None,
            "error": str(exc)[:500],
            "sources": source_names,
        }
def main():
    """Run the bulk-ingest cost test and write a timing summary to OUT.

    Prompts the operator to note current API spend, submits the stratified
    sample in batches of BATCH_SIZE, and extrapolates total migration runtime
    from the mean per-episode time of the successful batches.
    """
    rule = "=" * 60
    print(rule)
    print("Graphiti BULK Migration Cost Test (Haiku 4.5)")
    print(rule)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()

    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return

    batches = [sample[i:i+BATCH_SIZE] for i in range(0, len(sample), BATCH_SIZE)]
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE} episodes")
    print()

    results = []
    total_start = time.time()
    for i, batch in enumerate(batches, start=1):
        avg_chars = int(sum(len(d) for _, d in batch) / len(batch))
        print(f"[batch {i:2d}/{len(batches)}] n={len(batch)} avg_chars={avg_chars:6d}",
              end=" ", flush=True)
        result = submit_bulk_batch(batch)
        results.append(result)
        if result["error"]:
            error_text = result["error"]
            print(f" ERROR: {error_text[:80]}")
            # An HTTP 429 shows up in status_code, not necessarily in the body
            # text, so check the code as well as the original text heuristics.
            rate_limited = (
                result.get("status_code") == 429
                or "429" in error_text
                or "rate" in error_text.lower()
            )
            if rate_limited:
                print(" Rate limited - pausing 30s before next batch")
                time.sleep(30)
        else:
            print(f" {result['status_code']} {result['elapsed_s']}s "
                  f"({result['elapsed_per_episode_s']}s/episode)")
    total_elapsed = time.time() - total_start

    successful_batches = [r for r in results if r["error"] is None]
    failed_batches = [r for r in results if r["error"] is not None]
    successful_episodes = sum(r["batch_size"] for r in successful_batches)
    failed_episodes = sum(r["batch_size"] for r in failed_batches)

    mean_per_episode = None
    if successful_episodes:
        mean_per_episode = round(
            sum(r["elapsed_s"] for r in successful_batches) / successful_episodes, 2
        )

    summary = {
        "sample_size": len(sample),
        "batch_size": BATCH_SIZE,
        "n_batches": len(batches),
        "successful_batches": len(successful_batches),
        "failed_batches": len(failed_batches),
        "successful_episodes": successful_episodes,
        "failed_episodes": failed_episodes,
        "total_elapsed_s": round(total_elapsed, 1),
        "mean_elapsed_per_episode_s": mean_per_episode,
        "results": results,
    }
    # Persist the measurements before touching the database again, so a failed
    # corpus-count query cannot discard the results of a paid run.
    OUT.write_text(json.dumps(summary, indent=2))

    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
            total_sources = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        conn.close()

    summary["total_corpus_sources"] = total_sources
    if mean_per_episode:
        summary["estimated_migration_hours"] = round(
            total_sources * mean_per_episode / 3600, 1
        )
    OUT.write_text(json.dumps(summary, indent=2))

    print()
    print(rule)
    print("RESULTS")
    print(rule)
    print(f"Episodes: {summary['successful_episodes']}/{summary['sample_size']} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    if mean_per_episode:
        print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
    print(f"Total corpus sources: {summary['total_corpus_sources']}")
    # The estimate only exists when at least one episode succeeded.
    if "estimated_migration_hours" in summary:
        print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print(f"AFTER:")
    print(f" Wait 5 min; note new Anthropic spend; subtract from $28.61 baseline.")
    print(f" delta / {summary['successful_episodes']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
+122
View File
@@ -0,0 +1,122 @@
"""
Retest just the previously-failed batches after raising MAX_QUEUED_QUERIES.
Reads failed sources from graphiti_bulk_cost_test.json and resubmits.
"""
import json, os, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
BATCH_SIZE = 5
PRIOR_RESULTS = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
def fetch_doc_for_source(cur, source):
    """Return all chunks of ``source`` joined in id order, or None if no row."""
    sql = """
        SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id)
        FROM embeddings WHERE source = %s
    """
    cur.execute(sql, (source,))
    row = cur.fetchone()
    if not row:
        return None
    return row[0]
def submit_bulk_batch(batch):
    """POST one retry batch of (source, doc) pairs to /episodes/bulk.

    Documents are truncated to 12000 characters. Returns a result dict whose
    ``error`` is None on an OK response, otherwise the first 500 characters of
    the response body or of the raised exception's text.
    """
    episodes = []
    for source, doc in batch:
        episodes.append({
            "name": source,
            "content": doc[:12000],
            "source_description": "pgvector_migration_bulk_retry",
            "timestamp": "2026-04-28T00:00:00",
        })
    payload = {"episodes": episodes}
    source_names = [source for source, _ in batch]

    started = time.time()
    try:
        resp = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
        outcome = {
            "batch_size": len(batch),
            "status_code": resp.status_code,
            "elapsed_s": round(time.time() - started, 2),
            "elapsed_per_episode_s": round((time.time() - started) / len(batch), 2),
            "error": None if resp.ok else resp.text[:500],
            "sources": source_names,
        }
        return outcome
    except Exception as exc:
        # Failures are recorded, not raised, so later batches still run.
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "elapsed_per_episode_s": None,
            "error": str(exc)[:500],
            "sources": source_names,
        }
def main():
    """Resubmit every source from batches that errored in the prior bulk run.

    Reads PRIOR_RESULTS, reloads the text of each failed source from the
    embeddings table, resubmits in batches of BATCH_SIZE and writes a summary
    to OUT.
    """
    prior = json.loads(PRIOR_RESULTS.read_text())
    failed_sources = [
        source
        for batch_result in prior["results"]
        if batch_result["error"] is not None
        for source in batch_result["sources"]
    ]
    print(f"Retrying {len(failed_sources)} previously-failed sources")

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources_with_docs = []
    for source in failed_sources:
        text = fetch_doc_for_source(cur, source)
        if not text:
            print(f" WARN: could not find doc for source {source}")
            continue
        sources_with_docs.append((source, text))
    cur.close()
    conn.close()
    print(f"Loaded {len(sources_with_docs)} source docs")
    print()

    batches = []
    for start in range(0, len(sources_with_docs), BATCH_SIZE):
        batches.append(sources_with_docs[start:start + BATCH_SIZE])

    results = []
    total_start = time.time()
    for number, batch in enumerate(batches, start=1):
        avg = int(sum(len(text) for _, text in batch) / len(batch))
        print(f"[batch {number:2d}/{len(batches)}] n={len(batch)} avg_chars={avg:6d}",
              end=" ", flush=True)
        outcome = submit_bulk_batch(batch)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start

    # Split batch outcomes by whether an error was recorded.
    successful, failed = [], []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)

    summary = {
        "n_retry_sources": len(sources_with_docs),
        "n_batches": len(batches),
        "successful_batches": len(successful),
        "failed_batches": len(failed),
        "successful_episodes": sum(r["batch_size"] for r in successful),
        "failed_episodes": sum(r["batch_size"] for r in failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))

    rule = "=" * 60
    print()
    print(rule)
    print("RETRY RESULTS")
    print(rule)
    print(f"Episodes: {summary['successful_episodes']}/{len(sources_with_docs)} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print()
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
+93
View File
@@ -0,0 +1,93 @@
"""Retry attempt #2 — for sources that timed out after MAX_QUEUED_QUERIES bump."""
import json, os, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
BATCH_SIZE = 3 # smaller batches given timeouts
PRIOR = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry2.json"
def fetch_doc(cur, source):
    """Return the id-ordered concatenation of a source's chunks, or None."""
    query = "SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id) FROM embeddings WHERE source = %s"
    cur.execute(query, (source,))
    result_row = cur.fetchone()
    if result_row:
        return result_row[0]
    return None
def submit_batch(batch):
    """POST one batch of (source, doc) pairs to /episodes/bulk (retry #2).

    Documents are truncated to 12000 characters. The returned dict carries
    ``error`` = None for an OK response, else up to 500 characters of the
    response body or exception text.
    """
    episodes = []
    for source, doc in batch:
        episodes.append({
            "name": source,
            "content": doc[:12000],
            "source_description": "pgvector_migration_bulk_retry2",
            "timestamp": "2026-04-28T00:00:00",
        })
    source_names = [source for source, _ in batch]

    started = time.time()
    try:
        resp = requests.post(
            f"{GRAPHITI_URL}/episodes/bulk",
            json={"episodes": episodes},
            timeout=900,
        )
        outcome = {
            "batch_size": len(batch),
            "status_code": resp.status_code,
            "elapsed_s": round(time.time() - started, 2),
            "error": None if resp.ok else resp.text[:500],
            "sources": source_names,
        }
        return outcome
    except Exception as exc:
        # Record the failure instead of raising so the remaining batches run.
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "error": str(exc)[:500],
            "sources": source_names,
        }
def main():
    """Second retry pass over sources whose batches errored in the first retry.

    Reads PRIOR, reloads each failed source's text, resubmits in batches of
    BATCH_SIZE and writes the outcome summary to OUT.
    """
    prior = json.loads(PRIOR.read_text())
    failed = [
        source
        for entry in prior["results"]
        if entry["error"] is not None
        for source in entry["sources"]
    ]
    print(f"Retry #2: {len(failed)} sources still failing")

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources = []
    for source in failed:
        text = fetch_doc(cur, source)
        if text:
            sources.append((source, text))
    cur.close()
    conn.close()

    batches = []
    for start in range(0, len(sources), BATCH_SIZE):
        batches.append(sources[start:start + BATCH_SIZE])
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE}\n")

    results = []
    for number, batch in enumerate(batches, 1):
        avg = int(sum(len(text) for _, text in batch) / len(batch))
        print(f"[batch {number}/{len(batches)}] n={len(batch)} avg_chars={avg:6d}", end=" ", flush=True)
        outcome = submit_batch(batch)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")

    # Partition outcomes by whether an error was recorded.
    succ, fail = [], []
    for outcome in results:
        (succ if outcome["error"] is None else fail).append(outcome)

    summary = {
        "n_sources": len(sources),
        "successful_batches": len(succ),
        "failed_batches": len(fail),
        "successful_episodes": sum(r["batch_size"] for r in succ),
        "failed_episodes": sum(r["batch_size"] for r in fail),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))
    print()
    print(f"Episodes: {summary['successful_episodes']}/{len(sources)} succeeded")
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
+175
View File
@@ -0,0 +1,175 @@
"""
Measure actual Graphiti episode-add cost on a stratified sample of pgvector sources.
"""
import json, os, random, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
SAMPLE_SIZE = 50
RANDOM_SEED = 42
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_cost_test.json"
OUT.parent.mkdir(parents=True, exist_ok=True)
def fetch_stratified_sample():
    """Draw a seeded sample of (source, full_doc) pairs stratified by length.

    Chunks are concatenated per source in id order. Sources fall into short
    (<1000 chars), medium (1000-4999) and long (>=5000) pools, from which up to
    15, 25 and 10 are sampled after seeding with RANDOM_SEED.
    """
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("""
        SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
        FROM embeddings
        GROUP BY source
    """)
    # Sources with an empty or NULL aggregated document are skipped.
    sources = [(name, text) for name, text in cur.fetchall() if text]
    cur.close()
    conn.close()

    random.seed(RANDOM_SEED)

    pools = {"short": [], "medium": [], "long": []}
    for pair in sources:
        n_chars = len(pair[1])
        if n_chars < 1000:
            pools["short"].append(pair)
        elif n_chars < 5000:
            pools["medium"].append(pair)
        else:
            pools["long"].append(pair)
    short, medium, long_ = pools["short"], pools["medium"], pools["long"]
    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")

    # Sample in a fixed pool order so the seeded draw is reproducible.
    sample = []
    for pool, quota in ((short, 15), (medium, 25), (long_, 10)):
        sample += random.sample(pool, min(quota, len(pool)))
    print(f"Sample: {len(sample)} sources")
    return sample
def submit_episode(source: str, document: str) -> dict:
    """POST a single episode to /episodes and return a timing/result record.

    Only the first 12000 characters of ``document`` are sent. ``error`` in the
    record is None on an OK response, otherwise up to 500 characters of the
    response body or of the raised exception's text.
    """
    body = {
        "name": source,
        "content": document[:12000],
        "source_description": "pgvector_migration_cost_test",
        "timestamp": "2026-04-28T00:00:00",
    }
    full_len = len(document)
    sent_len = min(full_len, 12000)

    started = time.time()
    try:
        resp = requests.post(f"{GRAPHITI_URL}/episodes", json=body, timeout=600)
        record = {
            "source": source,
            "doc_chars": full_len,
            "doc_chars_sent": sent_len,
            "status_code": resp.status_code,
            "elapsed_s": round(time.time() - started, 2),
            "error": None if resp.ok else resp.text[:500],
        }
        return record
    except Exception as exc:
        # Failures are captured in the record so the caller's loop continues.
        return {
            "source": source,
            "doc_chars": full_len,
            "doc_chars_sent": sent_len,
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "error": str(exc)[:500],
        }
def main():
    """Run the single-episode cost test and write a timing summary to OUT.

    Prompts the operator to note current spend, submits the first sampled
    source as a smoke test (halting if it errors), then the rest one at a
    time, and extrapolates migration runtime from the mean successful latency.
    """
    rule = "=" * 60
    print(rule)
    print("Graphiti Migration Cost Test (Haiku 4.5)")
    print(rule)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()

    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return

    # Smoke test
    first_source, first_doc = sample[0]
    print(f"Smoke test on first source ({first_source[:50]}...):")
    smoke = submit_episode(first_source, first_doc)
    print(f" status={smoke['status_code']} elapsed={smoke['elapsed_s']}s")
    if smoke["error"]:
        print(f" ERROR: {smoke['error']}")
        OUT.write_text(json.dumps({"smoke_test": smoke}, indent=2))
        print("Halted — fix smoke test before bulk run.")
        return
    print(f" OK. Proceeding with {len(sample)} sources.")
    print()

    results = [smoke]
    total_start = time.time()
    for position, (source, doc) in enumerate(sample[1:], start=2):
        n_chars = len(doc)
        if n_chars < 1000:
            bucket = "short"
        elif n_chars < 5000:
            bucket = "medium"
        else:
            bucket = "long"
        print(f"[{position:2d}/{len(sample)}] [{bucket:6s}] [{n_chars:6d}c] {source[:50]:50s}", end=" ", flush=True)
        outcome = submit_episode(source, doc)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start

    # Split outcomes by whether an error was recorded.
    successful, failed = [], []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)

    summary = {
        "sample_size": len(sample),
        "successful": len(successful),
        "failed": len(failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "mean_elapsed_per_episode_s": round(
            sum(r["elapsed_s"] for r in successful) / max(len(successful), 1), 2
        ),
        "by_bucket": {},
        "results": results,
    }

    bucket_bounds = (("short", 0, 1000), ("medium", 1000, 5000), ("long", 5000, 10**9))
    for label, lower, upper in bucket_bounds:
        members = [r for r in successful if lower <= r["doc_chars"] < upper]
        if not members:
            continue
        count = len(members)
        summary["by_bucket"][label] = {
            "n": count,
            "mean_elapsed_s": round(sum(r["elapsed_s"] for r in members) / count, 2),
            "mean_chars": int(sum(r["doc_chars"] for r in members) / count),
        }

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
    total_sources = cur.fetchone()[0]
    cur.close()
    conn.close()

    summary["total_corpus_sources"] = total_sources
    summary["estimated_migration_hours"] = round(
        total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
    )
    OUT.write_text(json.dumps(summary, indent=2))

    print()
    print(rule)
    print("RESULTS")
    print(rule)
    print(f"Sample: {summary['successful']}/{summary['sample_size']} succeeded, {summary['failed']} failed")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
    for bucket, stats in summary["by_bucket"].items():
        print(f" {bucket:6s} n={stats['n']:3d} chars~{stats['mean_chars']:6d} elapsed~{stats['mean_elapsed_s']}s")
    print()
    print(f"Total corpus sources: {summary['total_corpus_sources']}")
    print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print("AFTER:")
    print(" Wait 5 min; note new Anthropic spend; subtract.")
    print(f" test_cost / {summary['successful']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
+155
View File
@@ -0,0 +1,155 @@
"""
E1.4 per-source predicate diversity comparison — fixed version.
Looks up episode uuids by name in both production and cascade graphs.
"""
import json
from collections import defaultdict
from falkordb import FalkorDB
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
PRODUCTION_GROUP = "aaron"
CASCADE_GROUP = "aaron_cascade_e14"
def get_predicates_for_episode(graph, episode_uuid):
    """Count distinct RELATES_TO edge names whose ``episodes`` list has this uuid.

    Returns 0 when the query yields no rows.
    """
    cypher = """
        MATCH ()-[r:RELATES_TO]->()
        WHERE $uuid IN r.episodes
        RETURN count(DISTINCT r.name) AS predicate_count
    """
    rows = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
def get_edge_count_for_episode(graph, episode_uuid):
    """Count RELATES_TO edges whose ``episodes`` list contains this uuid.

    Returns 0 when the query yields no rows.
    """
    cypher = """
        MATCH ()-[r:RELATES_TO]->()
        WHERE $uuid IN r.episodes
        RETURN count(r) AS edge_count
    """
    rows = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
def find_episode_uuid(graph, source_name):
    """Return the uuid of one Episodic node with this name, or None if absent."""
    cypher = """
        MATCH (e:Episodic {name: $name})
        RETURN e.uuid AS uuid
        LIMIT 1
    """
    rows = graph.query(cypher, {"name": source_name}).result_set
    if not rows:
        return None
    return rows[0][0]
def main():
    """Compare per-source predicate and edge counts between the two graphs.

    For every E1.4 result that has a ``submit_result``, looks the episode up by
    name in the production and cascade graphs, counts distinct predicates and
    edges tied to it in each, prints per-source, per-bucket and aggregate
    tables, and saves the per-source comparisons as JSON.
    """
    def _pct_change(new, old):
        # Percentage change from old to new; 0 when the baseline is 0 so an
        # empty baseline cannot raise ZeroDivisionError.
        return ((new - old) / old * 100) if old else 0

    db = FalkorDB(host='localhost', port=6379)
    prod_graph = db.select_graph(PRODUCTION_GROUP)
    cascade_graph = db.select_graph(CASCADE_GROUP)

    with open(E14_RESULTS) as f:
        e14 = json.load(f)

    sources = [r for r in e14['results'] if 'submit_result' in r]
    print(f"Analyzing {len(sources)} sources...")
    print()

    comparisons = []
    missing_prod = 0
    missing_cascade = 0
    for src in sources:
        name = src['name']
        bucket = src['bucket']
        prod_uuid = find_episode_uuid(prod_graph, name)
        cascade_uuid = find_episode_uuid(cascade_graph, name)
        if not prod_uuid:
            missing_prod += 1
            print(f" WARN: missing in production: {name}")
            continue
        if not cascade_uuid:
            missing_cascade += 1
            print(f" WARN: missing in cascade: {name}")
            continue
        prod_preds = get_predicates_for_episode(prod_graph, prod_uuid)
        cascade_preds = get_predicates_for_episode(cascade_graph, cascade_uuid)
        prod_edges = get_edge_count_for_episode(prod_graph, prod_uuid)
        cascade_edges = get_edge_count_for_episode(cascade_graph, cascade_uuid)
        comparisons.append({
            "name": name,
            "bucket": bucket,
            "prod_preds": prod_preds,
            "cascade_preds": cascade_preds,
            "delta_preds": cascade_preds - prod_preds,
            "prod_edges": prod_edges,
            "cascade_edges": cascade_edges,
            "delta_edges": cascade_edges - prod_edges,
        })

    if missing_prod or missing_cascade:
        print()
        print(f"Missing: {missing_prod} prod, {missing_cascade} cascade")
        print()

    if not comparisons:
        print("No comparable sources found. Aborting.")
        return

    # Per-source detail
    print(f"{'Bucket':<10} {'Source':<58} {'Preds A→B':<14} {'Δ':<6} {'Edges A→B':<14} {'Δ'}")
    print("-" * 115)
    for c in sorted(comparisons, key=lambda x: (x['bucket'], x['name'])):
        name_short = (c['name'][:55] + '..') if len(c['name']) > 58 else c['name']
        # Separate the two counts with an arrow; concatenating them directly
        # (12 and 15 -> "1215") made the column unreadable.
        preds_str = f"{c['prod_preds']}→{c['cascade_preds']}"
        edges_str = f"{c['prod_edges']}→{c['cascade_edges']}"
        # The first delta is padded to 6 so it lines up under its header.
        print(f"{c['bucket']:<10} {name_short:<58} {preds_str:<14} {c['delta_preds']:<+6d} {edges_str:<14} {c['delta_edges']:+d}")

    # Per-bucket aggregation
    print()
    print("=" * 115)
    print("PER-BUCKET AGGREGATION")
    print("=" * 115)
    by_bucket = defaultdict(list)
    for c in comparisons:
        by_bucket[c['bucket']].append(c)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        sum_pp = sum(c['prod_preds'] for c in items)
        sum_cp = sum(c['cascade_preds'] for c in items)
        sum_pe = sum(c['prod_edges'] for c in items)
        sum_ce = sum(c['cascade_edges'] for c in items)
        positive = sum(1 for c in items if c['delta_preds'] > 0)
        negative = sum(1 for c in items if c['delta_preds'] < 0)
        flat = sum(1 for c in items if c['delta_preds'] == 0)
        pct_pred = _pct_change(sum_cp, sum_pp)
        pct_edge = _pct_change(sum_ce, sum_pe)
        print(f"\n{bucket.upper()} (n={n}):")
        print(f" Predicates: {sum_pp}→{sum_cp} ({pct_pred:+.1f}%)")
        print(f" Edges: {sum_pe}→{sum_ce} ({pct_edge:+.1f}%)")
        print(f" Outcomes: {positive} positive, {flat} flat, {negative} negative")

    # Aggregate
    print()
    print("=" * 115)
    print(f"AGGREGATE (n={len(comparisons)})")
    print("=" * 115)
    total_pp = sum(c['prod_preds'] for c in comparisons)
    total_cp = sum(c['cascade_preds'] for c in comparisons)
    total_pe = sum(c['prod_edges'] for c in comparisons)
    total_ce = sum(c['cascade_edges'] for c in comparisons)
    # Same zero-baseline guard as the per-bucket figures.
    print(f" Predicates: {total_pp}→{total_cp} ({_pct_change(total_cp, total_pp):+.1f}%)")
    print(f" Edges: {total_pe}→{total_ce} ({_pct_change(total_ce, total_pe):+.1f}%)")

    out_path = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
    with open(out_path, "w") as f:
        json.dump(comparisons, f, indent=2)
    print()
    print(f"Saved to {out_path}")
if __name__ == "__main__":
main()
+208
View File
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""E1.4 orchestration — cascade re-extraction at n=30, group_id=aaron_cascade_e14."""
import json
import os
import requests
import time
from pathlib import Path
import psycopg2
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "e14_sample.json"
RESULTS_FILE = EXPERIMENTS / "e14_cascade_results.json"
PG_DSN = os.environ["PG_DSN"]
SIDECAR_URL = "http://localhost:8001"
TEST_GROUP_ID = "aaron_cascade_e14"
MAX_DOC_CHARS = 12000
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
def get_pg():
    """Open and return a new psycopg2 connection using PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
def fetch_source_text(source):
    """Return all chunks of ``source`` joined in id order, or None if absent.

    Opens a fresh connection per call. The cursor and connection are closed
    even if the query raises, so a failed lookup does not leak a connection.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
                FROM embeddings WHERE source = %s
            """, (source,))
            row = cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()
    # STRING_AGG over zero rows yields a single NULL, so check both levels.
    if row is None or row[0] is None:
        return None
    return row[0]
def run_mistral_metadata(text, max_retries=2):
    """Ask the local Mistral endpoint for document metadata as a dict.

    The document is truncated to MAX_DOC_CHARS and appended to METADATA_PROMPT.
    Read timeouts and connection errors are retried (5 s pause) up to
    ``max_retries`` attempts in total.

    Returns:
        The parsed metadata dict with ``char_length`` overwritten by the
        length of the truncated text, or a dict containing an ``"error"`` key
        when the request or parsing fails. Request and parsing failures are
        never raised to the caller.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    last_err = None
    for attempt in range(max_retries):
        try:
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
                timeout=300,
            )
            response.raise_for_status()
            raw = response.json()["response"]
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError) as e:
            # Transient transport problems: retry unless attempts are exhausted.
            last_err = e
            if attempt < max_retries - 1:
                print(f" (retry {attempt+1} after {type(e).__name__})", end=" ", flush=True)
                time.sleep(5)
            continue
        except (requests.exceptions.RequestException, KeyError, ValueError) as e:
            # HTTP error status, an undecodable body, or a body without a
            # "response" field: report it as an error dict instead of letting
            # it propagate and abort the caller's whole run.
            return {"error": f"Metadata request failed: {type(e).__name__}: {e}"}

        try:
            metadata = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return {"error": "JSON parse failed", "raw": str(raw)[:500]}
        if not isinstance(metadata, dict):
            # Valid JSON that is not an object cannot carry the metadata schema.
            return {"error": "JSON parse failed", "raw": str(raw)[:500]}
        # Trust the measured length rather than the model's own estimate.
        metadata["char_length"] = len(truncated)
        return metadata
    return {"error": f"After {max_retries} retries: {last_err}"}
def format_metadata_as_orientation(metadata):
    """Render metadata as an orientation paragraph for extraction instructions.

    Returns None when ``metadata`` carries an ``"error"`` key. Missing fields
    fall back to ``"unknown"`` (domain, format) or an empty summary.
    """
    if "error" in metadata:
        return None
    template = (
        "This is a {domain} document in {fmt} format. "
        "Summary: {summary} "
        "This metadata is provided to orient your extraction, not to constrain it. "
        "Extract entities and relationships freely from the document text itself; "
        "the metadata is descriptive context, not a checklist."
    )
    return template.format(
        domain=metadata.get("domain_class", "unknown"),
        fmt=metadata.get("primary_format", "unknown"),
        summary=metadata.get("one_sentence_summary", ""),
    )
def submit_episode_singular(name, content, custom_instructions):
    """POST one episode to the sidecar's /episodes endpoint in TEST_GROUP_ID.

    ``content`` is truncated to MAX_DOC_CHARS. Raises on a non-2xx response
    (via ``raise_for_status``); otherwise returns the decoded JSON body.
    """
    body = dict(
        name=name,
        content=content[:MAX_DOC_CHARS],
        source_description="e14_replication_run",
        timestamp="2026-04-29T00:00:00",
        group_id=TEST_GROUP_ID,
        custom_extraction_instructions=custom_instructions,
    )
    endpoint = f"{SIDECAR_URL}/episodes"
    reply = requests.post(endpoint, json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
def load_state():
    """Load prior run state from RESULTS_FILE.

    Returns a ``(results, completed)`` pair: the stored result records and the
    set of names whose record has a ``submit_result``. Both are empty when the
    results file does not exist.
    """
    if not RESULTS_FILE.exists():
        return [], set()
    with open(RESULTS_FILE) as f:
        data = json.load(f)
    prior = data.get("results", [])
    done = {record["name"] for record in prior if "submit_result" in record}
    return prior, done
def _save_results(results):
    """Persist the accumulated per-source records to RESULTS_FILE."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Run (or resume) the E1.4 cascade re-extraction over the selected sample.

    For each selected episode not already completed: fetch its text, generate
    Mistral metadata, turn that into extraction instructions, and submit the
    episode. Results are saved after every source so an interrupted run can
    resume from RESULTS_FILE.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]

    results, completed = load_state()
    if completed:
        print(f"Resuming — {len(completed)} sources already completed, {len(selected) - len(completed)} remaining\n")
    else:
        print(f"E1.4 cascade replication — {len(selected)} episodes to group_id={TEST_GROUP_ID}\n")

    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        if name in completed:
            print(f"[{i}/{len(selected)}] [{bucket}] {name} — SKIP (already completed)")
            continue
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")

        # This source is about to be (re)attempted. Drop any record left by an
        # earlier failed attempt so a resumed run keeps one record per source
        # instead of accumulating duplicates.
        results[:] = [r for r in results if r.get("name") != name]

        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}
        if ep.get("subtype"):
            record["subtype"] = ep["subtype"]

        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # None when metadata generation failed; the episode is still submitted.
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions

        print(f" Submitting via /episodes...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        results.append(record)
        _save_results(results)
        print()

    print(f"\nDone. Results saved to {RESULTS_FILE}")
if __name__ == "__main__":
main()
+160
View File
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
import json
import re
import subprocess
from pathlib import Path
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
E1_SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
OUTPUT = EXPERIMENTS / "e14_sample.json"
TARGETS = {"high": 8, "mid": 8, "low": 8, "document": 6}
def query_episode_counts():
    """Return per-episode entity counts from the ``aaron`` graph.

    Runs a Cypher query through ``redis-cli`` inside the ``falkordb``
    container and parses the plain-text reply.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in reply order
        (the query sorts by entity count, descending).

    Raises:
        RuntimeError: if the ``docker exec`` command exits non-zero. Without
            this check an empty reply would only surface later as an
            IndexError when the caller computes quartile boundaries.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"GRAPH.QUERY failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    return _parse_episode_counts(result.stdout)


def _parse_episode_counts(stdout):
    """Parse alternating name / count lines from a redis-cli GRAPH.QUERY reply."""
    lines = [l for l in stdout.split("\n") if l.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            # Header row: skip both column names ("name", "entities").
            i += 2
            continue
        if lines[i].startswith(("Cached", "Query")):
            # Trailing statistics section; no more data rows follow.
            break
        if i + 1 >= len(lines):
            i += 1
            continue
        try:
            count = int(lines[i + 1])
        except ValueError:
            # Next line is not a count; resynchronise one line further on.
            i += 1
            continue
        episodes.append({"name": lines[i], "entities": count})
        i += 2
    return episodes
def is_document(name):
    """Return True when *name* ends with a known document extension.

    The comparison is case-insensitive.
    """
    document_suffixes = (".pdf", ".docx", ".pptx", ".txt", ".md")
    return name.lower().endswith(document_suffixes)
def doc_subtype(name):
    """Guess a coarse document subtype from keywords in *name*.

    Keyword groups are tried in priority order (academic, reference,
    creative) against the lower-cased name; the first group with a matching
    substring wins, and "other" is returned when none match.
    """
    lowered = name.lower()
    keyword_groups = (
        ("academic", ("syllabus", "ind study", "_is")),
        ("reference", ("annual", "report", "_ar20", "rtpcc", "novo")),
        ("reference", ("cv", "resume", "application", "cover letter")),
        ("creative", ("marquee", "pptx", "presentation")),
    )
    for subtype, keywords in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return subtype
    return "other"
# Build the E1.4 sample and write it to OUTPUT.
# Steps: fetch per-episode entity counts, drop every episode named in E1's
# sample, bucket the remainder by quartile of entity count (documents form
# their own bucket), take a fixed number from each bucket, then dump the
# selection together with the stratification metadata.
def main():
print("Fetching episode entity counts from Tier 1 graph...")
episodes = query_episode_counts()
print(f"Got {len(episodes)} episodes")
# Load E1's sample to exclude
with open(E1_SAMPLE_FILE) as f:
e1_sample = json.load(f)
e1_names = {ep["name"] for ep in e1_sample["selected"]}
print(f"Excluding {len(e1_names)} sources from E1")
# Quartile boundaries
# Computed over ALL episodes (before the E1 exclusion), sorted descending.
# NOTE: counts[n // 4] raises IndexError when `episodes` is empty.
counts = sorted([e["entities"] for e in episodes], reverse=True)
n = len(counts)
top_q = counts[n // 4]
bottom_q = counts[3 * n // 4]
print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")
# Filter out E1 and bucket
# high/mid/low hold non-documents only; `docs` holds documents with at least
# 5 entities regardless of quartile.
# NOTE: when top_q == bottom_q, an episode with exactly that count passes both
# the `high` and the `low` filter and can therefore be selected twice.
available = [e for e in episodes if e["name"] not in e1_names]
high = [e for e in available if e["entities"] >= top_q and not is_document(e["name"])]
mid = [e for e in available if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
low = [e for e in available if e["entities"] <= bottom_q and not is_document(e["name"])]
docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]
print(f"\nAvailable after E1 exclusion:")
print(f" High-density: {len(high)}")
print(f" Mid-density: {len(mid)}")
print(f" Low-density: {len(low)}")
print(f" Documents: {len(docs)}")
# For high/mid/low: take from middle of bucket (avoids edge cases)
# Returns a window of n items centred on the bucket's midpoint, or the whole
# bucket (the same list object, not a copy) when it holds fewer than n items.
def pick(bucket, n):
if len(bucket) < n:
print(f" WARNING: only {len(bucket)} available, asked for {n}")
return bucket
mid_idx = len(bucket) // 2
start = max(0, mid_idx - n // 2)
return bucket[start:start + n]
# The episode dicts are tagged in place, so the "bucket" key is also visible
# through `episodes`, `available` and the bucket lists.
selected = []
for ep in pick(high, TARGETS["high"]):
ep["bucket"] = "high"
selected.append(ep)
for ep in pick(mid, TARGETS["mid"]):
ep["bucket"] = "mid"
selected.append(ep)
for ep in pick(low, TARGETS["low"]):
ep["bucket"] = "low"
selected.append(ep)
# For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
doc_targets = {"academic": 2, "creative": 2, "reference": 2}
docs_by_subtype = {}
for ep in docs:
st = doc_subtype(ep["name"])
ep["subtype"] = st
docs_by_subtype.setdefault(st, []).append(ep)
print(f"\n Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")
# Pick from middle of each subtype bucket
for subtype, target in doc_targets.items():
sub_docs = docs_by_subtype.get(subtype, [])
picked = pick(sub_docs, target)
for ep in picked:
ep["bucket"] = "document"
selected.append(ep)
# If we're short on documents (e.g., subtype underrepresented), fill from "other"
# The filler is drawn from every not-yet-selected document in `docs` order,
# whatever its subtype, not only from the "other" subtype.
doc_count = sum(1 for s in selected if s.get("bucket") == "document")
if doc_count < TARGETS["document"]:
shortage = TARGETS["document"] - doc_count
leftover = [e for e in docs if e["name"] not in {s["name"] for s in selected}]
for ep in leftover[:shortage]:
ep["bucket"] = "document"
ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
selected.append(ep)
print(f"\nSelected {len(selected)} episodes for E1.4:")
for ep in selected:
sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
print(f" [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e {ep['name']}")
# Persist the selection plus the parameters that produced it.
with open(OUTPUT, "w") as f:
json.dump({
"metadata": {
"purpose": "E1.4 cascade re-extraction replication (n=30)",
"exclusions": "E1's 10 sources",
"stratification": {**TARGETS, "document_subtypes": doc_targets},
"quartile_top": top_q,
"quartile_bottom": bottom_q,
},
"selected": selected,
}, f, indent=2)
print(f"\nSaved to {OUTPUT}")
# Run the selection only when executed as a script.
if __name__ == "__main__":
main()
+246
View File
@@ -0,0 +1,246 @@
"""
E1.6 analysis — correlate domain-purity ratings with cascade outcomes.
Applies pre-registered decision rules from E1.6 protocol.
"""
import json
from collections import defaultdict
RATINGS_PATH = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
COMPARISON_PATH = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
def spearman(xs, ys):
    """Compute the Spearman rank correlation of two equal-length sequences.

    Tied values receive the average of the ranks they span; the coefficient
    is the Pearson correlation of the two rank vectors.

    Args:
        xs: First sequence of comparable values.
        ys: Second sequence, paired element-wise with ``xs``.

    Returns:
        The correlation as a float, or None when there are fewer than two
        observations or either sequence has no rank variance.

    Raises:
        ValueError: if ``xs`` and ``ys`` differ in length. Previously a
            longer ``ys`` silently skewed the result and a shorter one
            raised IndexError.
    """
    n = len(xs)
    if len(ys) != n:
        raise ValueError(
            f"spearman() needs equal-length inputs, got {n} and {len(ys)}"
        )
    if n < 2:
        return None

    def rank(values):
        # Positions of `values` in ascending order of value.
        order = sorted(range(len(values)), key=lambda idx: values[idx])
        ranks = [0] * len(values)
        i = 0
        while i < len(order):
            # Extend j over the run of values tied with position i.
            j = i
            while j + 1 < len(order) and values[order[j + 1]] == values[order[i]]:
                j += 1
            # 1-based average rank shared by the whole tied run.
            avg_rank = (i + j) / 2 + 1
            for k in range(i, j + 1):
                ranks[order[k]] = avg_rank
            i = j + 1
        return ranks

    rx = rank(xs)
    ry = rank(ys)
    mean_rx = sum(rx) / n
    mean_ry = sum(ry) / n
    num = sum((a - mean_rx) * (b - mean_ry) for a, b in zip(rx, ry))
    den_x = sum((a - mean_rx) ** 2 for a in rx) ** 0.5
    den_y = sum((b - mean_ry) ** 2 for b in ry) ** 0.5
    if den_x == 0 or den_y == 0:
        return None
    return num / (den_x * den_y)
def main():
    """Join purity ratings with cascade outcomes and print the E1.6 report.

    Reads the ratings from ``RATINGS_PATH`` and the per-source comparison
    from ``COMPARISON_PATH``, joins them on source name, prints the primary
    (binary purity vs. outcome), secondary (Spearman), tertiary
    (within-bucket) and mean-delta sections, and writes the joined rows to
    ``e16_joined_analysis.json``.
    """
    with open(RATINGS_PATH) as f:
        ratings_data = json.load(f)
    with open(COMPARISON_PATH) as f:
        comparisons = json.load(f)

    ratings_by_name = {r['name']: r for r in ratings_data['ratings']}
    comp_by_name = {c['name']: c for c in comparisons}

    # Join ratings with cascade outcomes; ratings without a comparison row
    # are dropped.
    joined = []
    for name, rating in ratings_by_name.items():
        if name in comp_by_name:
            comp = comp_by_name[name]
            joined.append({
                'name': name,
                'binary': rating['binary'],
                'score': rating['score'],
                'note': rating.get('note'),
                'bucket': comp['bucket'],
                'delta_preds': comp['delta_preds'],
                'delta_edges': comp['delta_edges'],
                'prod_preds': comp['prod_preds'],
                'cascade_preds': comp['cascade_preds'],
            })

    print("=" * 100)
    print(f"E1.6 ANALYSIS — Domain Purity vs Cascade Outcome (n={len(joined)})")
    print("=" * 100)

    # Per-source detail with rating
    print()
    print(f"{'Bucket':<10} {'Source':<48} {'Domain':<8} {'Score':<6} {'Δpreds':<8} {'Δedges':<8}")
    print("-" * 100)
    for j in sorted(joined, key=lambda x: (x['binary'], -x['score'], x['bucket'], x['name'])):
        name_short = (j['name'][:45] + '..') if len(j['name']) > 48 else j['name']
        print(f"{j['bucket']:<10} {name_short:<48} {j['binary']:<8} {j['score']:<6} {j['delta_preds']:+d} {j['delta_edges']:+d}")

    # PRIMARY TEST: binary purity vs cascade outcome distribution
    print()
    print("=" * 100)
    print("PRIMARY TEST: Binary purity vs cascade outcome distribution")
    print("=" * 100)

    def categorize_outcome(delta):
        if delta > 0:
            return 'positive'
        elif delta < 0:
            return 'negative'
        else:
            return 'flat'

    by_binary = defaultdict(lambda: {'positive': 0, 'flat': 0, 'negative': 0, 'total': 0})
    for j in joined:
        outcome = categorize_outcome(j['delta_preds'])
        by_binary[j['binary']][outcome] += 1
        by_binary[j['binary']]['total'] += 1

    print()
    print(f"{'Group':<15} {'n':<5} {'Positive':<12} {'Flat':<10} {'Negative':<12}")
    print("-" * 60)
    for binary in ['single', 'multi']:
        d = by_binary[binary]
        n = d['total']
        if n == 0:
            continue
        pos_pct = d['positive'] / n * 100
        flat_pct = d['flat'] / n * 100
        neg_pct = d['negative'] / n * 100
        print(f"{binary+'-domain':<15} {n:<5} {d['positive']} ({pos_pct:.0f}%) {d['flat']} ({flat_pct:.0f}%) {d['negative']} ({neg_pct:.0f}%)")

    # Compute the gap (only defined when both groups are non-empty)
    if by_binary['single']['total'] > 0 and by_binary['multi']['total'] > 0:
        single_pos_rate = by_binary['single']['positive'] / by_binary['single']['total'] * 100
        multi_pos_rate = by_binary['multi']['positive'] / by_binary['multi']['total'] * 100
        gap = single_pos_rate - multi_pos_rate
        print()
        print(f"Cascade-positive rate gap (single - multi): {gap:+.1f} percentage points")
        print()
        # Apply pre-registered decision rule
        if gap >= 20:
            verdict = "NARROWNESS HYPOTHESIS SUPPORTED"
            detail = f"Single-domain content is {gap:.0f}pp more likely to gain from cascade than multi-domain."
        elif gap <= -20:
            verdict = "REVERSE OF HYPOTHESIS"
            detail = "Multi-domain content unexpectedly benefits more (counter to prediction)."
        elif abs(gap) < 10:
            verdict = "HYPOTHESIS NOT SUPPORTED"
            detail = "Domain purity does not appear to predict cascade outcome."
        else:
            verdict = "INCONCLUSIVE"
            detail = f"Gap of {gap:+.0f}pp is suggestive but below the pre-registered 20pp threshold."
        print(f" Pre-registered decision rule: {verdict}")
        print(f" {detail}")

    # SECONDARY TEST: Spearman correlation between purity score and predicate delta
    print()
    print("=" * 100)
    print("SECONDARY TEST: Spearman rank correlation (purity score vs predicate delta)")
    print("=" * 100)
    scores = [j['score'] for j in joined]
    deltas_pred = [j['delta_preds'] for j in joined]
    deltas_edge = [j['delta_edges'] for j in joined]
    rho_pred = spearman(scores, deltas_pred)
    rho_edge = spearman(scores, deltas_edge)

    def fmt_rho(rho):
        # spearman() yields None for too few points or zero rank variance;
        # applying ":.3f" to None would raise TypeError.
        return f"{rho:.3f}" if rho is not None else "n/a"

    print()
    print(f" Spearman ρ (purity score vs Δpredicates): {fmt_rho(rho_pred)}")
    print(f" Spearman ρ (purity score vs Δedges): {fmt_rho(rho_edge)}")
    print()
    if rho_pred is not None:
        if rho_pred >= 0.4:
            v = "STRONG POSITIVE — narrowness hypothesis supported with monotonic relationship"
        elif rho_pred >= 0.2:
            v = "WEAK POSITIVE — consistent with hypothesis but not strong evidence"
        elif rho_pred <= -0.2:
            v = "NEGATIVE — refutes hypothesis"
        else:
            v = "NO CORRELATION — hypothesis not supported"
        print(f" Predicate delta verdict: {v}")
        print()

    # TERTIARY TEST: within-bucket correlation
    print()
    print("=" * 100)
    print("TERTIARY TEST: Within-bucket correlation")
    print("=" * 100)
    by_bucket = defaultdict(list)
    for j in joined:
        by_bucket[j['bucket']].append(j)
    print()
    print(f"{'Bucket':<12} {'n':<5} {'Single':<10} {'Multi':<10} {'ρ (score vs Δpred)':<22}")
    print("-" * 75)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        n_single = sum(1 for j in items if j['binary'] == 'single')
        n_multi = sum(1 for j in items if j['binary'] == 'multi')
        if n >= 3:
            scores_b = [j['score'] for j in items]
            deltas_b = [j['delta_preds'] for j in items]
            rho_b = spearman(scores_b, deltas_b)
            rho_str = f"{rho_b:+.3f}" if rho_b is not None else "n/a (no variance)"
        else:
            rho_str = "n/a (too few)"
        print(f"{bucket:<12} {n:<5} {n_single:<10} {n_multi:<10} {rho_str}")

    # Interaction with bucket: do single/multi outcomes differ within bucket?
    print()
    print("Per-bucket cascade-positive rate by binary purity:")
    print()
    print(f"{'Bucket':<12} {'Single':<25} {'Multi':<25}")
    print("-" * 65)

    def rate_str(group):
        if not group:
            return ""
        pos = sum(1 for j in group if j['delta_preds'] > 0)
        return f"{pos}/{len(group)} positive ({pos/len(group)*100:.0f}%)"

    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        single_items = [j for j in items if j['binary'] == 'single']
        multi_items = [j for j in items if j['binary'] == 'multi']
        print(f"{bucket:<12} {rate_str(single_items):<25} {rate_str(multi_items):<25}")

    # MEAN DELTA by binary group
    print()
    print("=" * 100)
    print("MEAN PREDICATE DELTA BY GROUP")
    print("=" * 100)
    print()
    for binary in ['single', 'multi']:
        items = [j for j in joined if j['binary'] == binary]
        if not items:
            continue
        n = len(items)
        mean_dp = sum(j['delta_preds'] for j in items) / n
        mean_de = sum(j['delta_edges'] for j in items) / n
        sum_pp = sum(j['prod_preds'] for j in items)
        sum_cp = sum(j['cascade_preds'] for j in items)
        pct_change = (sum_cp - sum_pp) / sum_pp * 100 if sum_pp else 0
        print(f"{binary}-domain (n={n}):")
        print(f" Mean Δpredicates per source: {mean_dp:+.2f}")
        print(f" Mean Δedges per source: {mean_de:+.2f}")
        # Separate the two totals; printed back-to-back they read as one number.
        print(f" Aggregate predicate change: {sum_pp} → {sum_cp} ({pct_change:+.1f}%)")
        print()

    # Save joined data for the experiments log writeup
    out_path = "/home/aaron/aaronai/experiments/e16_joined_analysis.json"
    with open(out_path, "w") as f:
        json.dump(joined, f, indent=2)
    print(f"Joined data saved to {out_path}")


if __name__ == "__main__":
    main()
+206
View File
@@ -0,0 +1,206 @@
"""
E1.6 domain-purity rating interface — with full metadata context.
"""
import json
import os
import random
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
RATINGS_OUT = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
INTRO = """
================================================================================
E1.6 — DOMAIN-PURITY RATING
================================================================================
Two ratings per source:
1. BINARY — single-domain (s) or multi-domain (m)?
Mental test: "If Mistral had to pick ONE domain class for this source,
would picking just one significantly UNDER-DESCRIBE the content?"
YES → MULTI-DOMAIN (m) — content lives across two+ frames meaningfully
NO → SINGLE-DOMAIN (s) — content fits cleanly within one frame
2. SCORE (1-5) — how cleanly does it fit?
5 = unambiguously one domain
4 = primarily one domain, slight other element
3 = balanced two-domain
2 = primarily two-domain with traces of a third
1 = three or more domain frames weighted significantly
Single binary usually = score 4-5
Multi binary usually = score 1-3
You see for each source: name, length, AND the full Mistral metadata block
(domain_class, primary_format, structural_signals, content_signals, summary).
Blind to: bucket assignment, cascade outcome.
Commands at any prompt: 's', 'm', 'skip', 'quit'
================================================================================
""".strip()
def load_existing():
    """Load previously saved rating state, or return a fresh empty state.

    Returns the parsed JSON from ``RATINGS_OUT`` when that file exists,
    otherwise a dict with empty ``ratings`` and ``completed_names`` lists.
    """
    if not os.path.exists(RATINGS_OUT):
        return {"ratings": [], "completed_names": []}
    with open(RATINGS_OUT) as ratings_file:
        return json.load(ratings_file)
def save(data):
    """Persist the rating state to ``RATINGS_OUT`` as indented JSON.

    The JSON is written to a sibling temporary file and then moved over the
    target with ``os.replace``, so an interrupt in the middle of a write
    cannot leave a truncated ratings file behind.
    """
    tmp_path = f"{RATINGS_OUT}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, RATINGS_OUT)
def render_metadata(metadata):
    """Print a metadata dict, well-known fields first, then any others.

    Non-dict input and dicts carrying an ``error`` key each produce a single
    explanatory line. ``char_length`` is never shown.
    """
    if not isinstance(metadata, dict):
        print(" (metadata unavailable)")
        return
    if 'error' in metadata:
        print(f" (metadata error: {metadata['error']})")
        return

    # Fields shown first, in this fixed order.
    standard_keys = (
        'domain_class',
        'primary_format',
        'structural_signals',
        'content_signals',
        'summary',
    )
    for key in standard_keys:
        if key not in metadata:
            continue
        content = metadata[key]
        title = key.replace('_', ' ').title()
        if isinstance(content, list):
            if not content:
                print(f" {title}: (none)")
                continue
            print(f" {title}:")
            for entry in content:
                print(f" - {entry}")
        elif isinstance(content, str) and len(content) > 70:
            # Long strings go on their own line below the label.
            print(f" {title}:")
            print(f" {content}")
        else:
            print(f" {title}: {content}")

    # Everything else the dict carries, in the dict's own order.
    for key, content in metadata.items():
        if key in standard_keys or key == 'char_length':
            continue
        print(f" {key.replace('_', ' ').title()}: {content}")
def render_source(src, idx, total):
    """Print the banner, name, length and metadata block for one source.

    ``idx`` and ``total`` are shown as the progress counter in the banner.
    """
    banner = "=" * 80
    print("")
    print(banner)
    print(f" Source {idx}/{total}")
    print(banner)
    print(f"Name: {src['name']}")
    print(f"Length: {src['doc_chars']:,} chars")
    print("")
    print("Mistral metadata:")
    print("")
    render_metadata(src.get('metadata', {}))
    print("")
    print("-" * 80)
def get_rating():
    """Prompt on stdin for one source's binary rating, score and note.

    Returns:
        ``'quit'`` if the rater types quit at either prompt, ``None`` if the
        rater types skip at either prompt, otherwise a dict with ``binary``
        ("single" or "multi"), ``score`` (int 1-5) and ``note`` (str or None).
    """
    while True:
        binary = input("Single-domain or multi-domain? [s/m/skip/quit]: ").strip().lower()
        if binary in ('s', 'm', 'skip', 'quit'):
            break
        print(" Please enter 's', 'm', 'skip', or 'quit'")
    if binary == 'quit':
        return 'quit'
    if binary == 'skip':
        return None

    while True:
        score_input = input("Purity score (1=many frames, 5=clearly single): ").strip()
        command = score_input.lower()
        if command == 'quit':
            return 'quit'
        if command == 'skip':
            # The intro text promises 'skip' at any prompt, not only the first.
            return None
        try:
            score = int(score_input)
        except ValueError:
            print(" Please enter a number 1-5 (or 'skip' / 'quit')")
            continue
        if 1 <= score <= 5:
            break
        print(" Score must be 1-5")

    note = input("Optional note (Enter to skip): ").strip()
    return {
        "binary": "single" if binary == 's' else "multi",
        "score": score,
        "note": note if note else None,
    }
# Interactive rating session: show each not-yet-rated source in a fixed
# shuffled order, collect a rating, and save the state after every recorded
# rating so the session can be resumed later.
def main():
with open(E14_RESULTS) as f:
e14 = json.load(f)
# Only sources whose record carries a 'submit_result' key are rated.
sources = [r for r in e14['results'] if 'submit_result' in r]
# Fixed seed: the presentation order is identical on every run.
rng = random.Random(42)
shuffled = list(sources)
rng.shuffle(shuffled)
state = load_existing()
completed = set(state['completed_names'])
remaining = [s for s in shuffled if s['name'] not in completed]
print(INTRO)
print()
print(f"Total sources: {len(sources)}")
print(f"Already rated: {len(completed)}")
print(f"Remaining: {len(remaining)}")
print()
if not remaining:
print("All sources rated. Run analysis script next.")
return
input("Press Enter to begin...")
try:
# Numbering continues from the count of already-rated sources. A skipped
# source still consumes a number, and because it is not added to
# completed_names it is offered again on the next run.
for i, src in enumerate(remaining, start=len(completed) + 1):
render_source(src, i, len(sources))
try:
rating = get_rating()
except (KeyboardInterrupt, EOFError):
print("\n\nSaving and exiting...")
save(state)
return
if rating == 'quit':
print("\nSaving and exiting...")
save(state)
return
if rating is None:
print(" Skipped")
continue
rating['name'] = src['name']
state['ratings'].append(rating)
state['completed_names'].append(src['name'])
# Persist after every rating so an abrupt exit loses nothing recorded so far.
save(state)
print(f" Recorded: {rating['binary']}-domain, score={rating['score']}")
print()
print("=" * 80)
print(f"Done. Rated {len(state['ratings'])} sources.")
print(f"Saved to {RATINGS_OUT}")
# Catches interrupts raised outside get_rating() (e.g. while rendering).
except (KeyboardInterrupt, EOFError):
print("\n\nSaving...")
save(state)
# Start the session only when executed as a script.
if __name__ == "__main__":
main()
+134
View File
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""E1 metrics comparison — A (Tier 1 aaron) vs B (cascade aaron_cascade_test) on the 10 sample sources."""
import json
import subprocess
from pathlib import Path
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
COMPARISON_FILE = EXPERIMENTS / "cascade_reextract_comparison.json"
def query(group_id, cypher):
    """Run *cypher* against graph *group_id* and return redis-cli's stdout.

    The query is executed with ``redis-cli GRAPH.QUERY`` inside the
    ``falkordb`` container; the exit status and stderr are not inspected.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
def parse_int_result(output):
    """Return the first all-digit line of *output* as an int, or 0 if none.

    Lines are stripped of surrounding whitespace before being tested.
    """
    candidates = (piece.strip() for piece in output.split("\n"))
    return next((int(text) for text in candidates if text.isdigit()), 0)
def parse_string_list(output):
    """Return the data lines of a redis-cli reply, minus header and footer.

    The first non-blank line that is shorter than 60 characters and contains
    none of ``{}[]`` is taken to be the column header and dropped; lines
    before it are ignored. Every later line is collected until one starts
    with "Cached" or "Query internal".
    """
    collected = []
    header_seen = False
    for raw in output.split("\n"):
        row = raw.strip()
        if not row:
            continue
        if row.startswith(("Cached", "Query internal")):
            break
        if header_seen:
            collected.append(row)
        elif len(row) < 60 and set(row).isdisjoint("{}[]"):
            # Column names are not known up front, so the header is detected
            # heuristically by its shape.
            header_seen = True
    return collected
def metrics_for_source(group_id, source_name):
    """Collect entity, edge and relationship-type counts for one episode.

    Args:
        group_id: Name of the graph to query.
        source_name: Value of the Episodic node's ``name`` property.

    Returns:
        A dict with integer ``entities``, ``edges`` and ``rel_types`` counts.
        Each count is 0 when the reply contains no numeric line.
    """
    # Escape backslashes first, then double quotes, so that names containing
    # either character still form a valid double-quoted Cypher literal
    # instead of breaking the query.
    safe_name = source_name.replace("\\", "\\\\").replace('"', '\\"')
    episode = f'MATCH (e:Episodic {{name: "{safe_name}"}})'

    # Total entities connected to this episode
    entities = parse_int_result(query(
        group_id, f'{episode}-[]-(n:Entity) RETURN count(distinct n) AS entities'))

    # Total edges from this episode (all relationship types)
    edges = parse_int_result(query(
        group_id, f'{episode}-[r]-() RETURN count(r) AS edges'))

    # Distinct relationship types in edges from entities of this episode
    rel_types = parse_int_result(query(
        group_id, f'{episode}-[]-(n:Entity)-[r]-() RETURN count(distinct type(r)) AS types'))

    return {"entities": entities, "edges": edges, "rel_types": rel_types}
def _pct_delta(base, new, decimals=1):
    """Format the change from *base* to *new* as a signed percentage, or "n/a" when *base* is zero."""
    if not base:
        return "n/a"
    return f"{(new - base) / base * 100:+.{decimals}f}%"


def main():
    """Compare per-source graph metrics between the A and B graphs.

    For every source in ``SAMPLE_FILE`` the entity, edge and relationship-type
    counts are fetched from graph ``aaron`` (A) and ``aaron_cascade_test``
    (B), printed as a table, aggregated overall and per bucket, and written
    to ``COMPARISON_FILE``. Percentage deltas print as "n/a" when the A-side
    base is zero instead of raising ZeroDivisionError.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 metrics comparison — {len(selected)} sources, A=aaron vs B=aaron_cascade_test\n")
    print(f"{'Source':<60} {'A.ent':>6} {'B.ent':>6} {'A.edg':>6} {'B.edg':>6} {'A.typ':>6} {'B.typ':>6}")
    print("-" * 110)

    results = []
    for ep in selected:
        name = ep["name"]
        bucket = ep["bucket"]
        a = metrics_for_source("aaron", name)
        b = metrics_for_source("aaron_cascade_test", name)
        record = {
            "name": name, "bucket": bucket,
            "a_entities": a["entities"], "b_entities": b["entities"],
            "a_edges": a["edges"], "b_edges": b["edges"],
            "a_rel_types": a["rel_types"], "b_rel_types": b["rel_types"],
        }
        results.append(record)
        # Truncate name for display
        display_name = name if len(name) <= 58 else name[:55] + "..."
        print(f"{display_name:<60} {a['entities']:>6} {b['entities']:>6} {a['edges']:>6} {b['edges']:>6} {a['rel_types']:>6} {b['rel_types']:>6}")

    n = len(results)
    if n == 0:
        # Nothing to average; the means below would divide by zero.
        print("No sources in sample; nothing to compare.")
        return

    # Aggregates
    print("\n" + "=" * 110)
    a_ent_sum = sum(r["a_entities"] for r in results)
    b_ent_sum = sum(r["b_entities"] for r in results)
    a_edge_sum = sum(r["a_edges"] for r in results)
    b_edge_sum = sum(r["b_edges"] for r in results)
    a_types_sum = sum(r["a_rel_types"] for r in results)
    b_types_sum = sum(r["b_rel_types"] for r in results)
    print(f"\nAggregate (n={n}):")
    print(f" Entities: A mean={a_ent_sum/n:.1f} B mean={b_ent_sum/n:.1f} delta={_pct_delta(a_ent_sum, b_ent_sum)}")
    print(f" Edges: A mean={a_edge_sum/n:.1f} B mean={b_edge_sum/n:.1f} delta={_pct_delta(a_edge_sum, b_edge_sum)}")
    print(f" Rel types: A mean={a_types_sum/n:.1f} B mean={b_types_sum/n:.1f} delta={_pct_delta(a_types_sum, b_types_sum)}")

    # Global predicate diversity check (unique types in each group_id)
    print("\nGlobal predicate diversity:")
    a_global = parse_int_result(query("aaron", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    b_global = parse_int_result(query("aaron_cascade_test", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    print(f" A (aaron): {a_global} distinct relationship types across whole graph")
    print(f" B (aaron_cascade_test): {b_global} distinct relationship types across whole graph")

    # Per-bucket
    print("\nPer-bucket aggregates:")
    for bucket in ["high", "mid", "low", "document"]:
        bucket_results = [r for r in results if r["bucket"] == bucket]
        if not bucket_results:
            continue
        bn = len(bucket_results)
        a_e = sum(r["a_entities"] for r in bucket_results) / bn
        b_e = sum(r["b_entities"] for r in bucket_results) / bn
        a_ed = sum(r["a_edges"] for r in bucket_results) / bn
        b_ed = sum(r["b_edges"] for r in bucket_results) / bn
        print(f" [{bucket:>8}] n={bn} A.ent={a_e:.1f} B.ent={b_e:.1f} ({_pct_delta(a_e, b_e, 0)}) "
              f"A.edg={a_ed:.1f} B.edg={b_ed:.1f} ({_pct_delta(a_ed, b_ed, 0)})")

    with open(COMPARISON_FILE, "w") as f:
        json.dump({
            "results": results,
            "aggregate": {
                "a_entities_total": a_ent_sum, "b_entities_total": b_ent_sum,
                "a_edges_total": a_edge_sum, "b_edges_total": b_edge_sum,
                "global_predicate_diversity": {"a": a_global, "b": b_global},
            },
        }, f, indent=2)
    print(f"\nSaved to {COMPARISON_FILE}")


if __name__ == "__main__":
    main()
+115
View File
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""E1 corrected metric — count distinct predicate names on edges originating from each episode."""
import json
import subprocess
from pathlib import Path
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
def query(group_id, cypher):
    """Execute *cypher* on graph *group_id* and hand back the raw reply text.

    Shells out to ``redis-cli GRAPH.QUERY`` in the ``falkordb`` container and
    returns its captured stdout; stderr and the exit code are ignored.
    """
    argv = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    proc = subprocess.run(argv, capture_output=True, text=True)
    return proc.stdout
def get_episode_uuid(group_id, episode_name):
    """Look up the UUID of the Episodic node named *episode_name*.

    Args:
        group_id: Name of the graph to query.
        episode_name: Value of the node's ``name`` property.

    Returns:
        The first reply line shaped like a UUID (36 characters, four
        hyphens), or None when no such line is present.
    """
    # Escape backslashes before single quotes: a raw backslash in the name
    # would otherwise start an escape sequence inside the Cypher literal
    # (and a trailing one would swallow the closing quote).
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for raw in output.split("\n"):
        line = raw.strip()
        # UUID format check
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
def count_predicates_for_episode(group_id, uuid):
    """Count distinct ``r.name`` values on RELATES_TO edges listing *uuid*.

    An edge is counted when *uuid* appears in its ``episodes`` property.
    Returns 0 when the reply contains no all-digit line.
    """
    statement = (
        "MATCH ()-[r:RELATES_TO]->() "
        f"WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    )
    reply_lines = (raw.strip() for raw in query(group_id, statement).split("\n"))
    return next((int(text) for text in reply_lines if text.isdigit()), 0)
def count_total_edges_for_episode(group_id, uuid):
    """Count RELATES_TO edges whose ``episodes`` property contains *uuid*.

    Returns 0 when the reply contains no all-digit line.
    """
    statement = (
        "MATCH ()-[r:RELATES_TO]->() "
        f"WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    )
    reply_lines = (raw.strip() for raw in query(group_id, statement).split("\n"))
    return next((int(text) for text in reply_lines if text.isdigit()), 0)
# Script body (runs at import time; there is no main() guard in this file).
# For every source in the E1 sample, resolve its episode UUID in graph A
# ("aaron") and graph B ("aaron_cascade_test"), count the RELATES_TO edges and
# distinct predicate names attributed to that UUID, print a per-source table
# plus aggregates, and save everything as JSON.
with open(SAMPLE_FILE) as f:
sample = json.load(f)
selected = sample["selected"]
print(f"E1 corrected per-source comparison — predicates per episode by edge origin\n")
print(f"{'Source':<60} {'A.edges':>8} {'A.preds':>8} {'B.edges':>8} {'B.preds':>8}")
print("-" * 100)
# Running totals across all sources.
a_pred_total = 0
b_pred_total = 0
a_edge_total = 0
b_edge_total = 0
records = []
for ep in selected:
name = ep["name"]
a_uuid = get_episode_uuid("aaron", name)
b_uuid = get_episode_uuid("aaron_cascade_test", name)
# An episode missing from a graph (no UUID found) is recorded as 0 edges /
# 0 predicates, which is indistinguishable from a present-but-empty episode.
a_edges = count_total_edges_for_episode("aaron", a_uuid) if a_uuid else 0
a_preds = count_predicates_for_episode("aaron", a_uuid) if a_uuid else 0
b_edges = count_total_edges_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0
b_preds = count_predicates_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0
# Truncate long names so the table columns stay aligned.
display = name if len(name) <= 58 else name[:55] + "..."
print(f"{display:<60} {a_edges:>8} {a_preds:>8} {b_edges:>8} {b_preds:>8}")
records.append({
"name": name, "bucket": ep["bucket"],
"a_edges": a_edges, "a_preds": a_preds,
"b_edges": b_edges, "b_preds": b_preds,
})
a_pred_total += a_preds
b_pred_total += b_preds
a_edge_total += a_edges
b_edge_total += b_edges
print("-" * 100)
# NOTE: the means below divide by n and raise ZeroDivisionError when the
# sample's "selected" list is empty.
n = len(selected)
print(f"\nAggregate (n={n}):")
print(f" Edges: A total={a_edge_total} mean={a_edge_total/n:.1f} B total={b_edge_total} mean={b_edge_total/n:.1f}")
print(f" Predicates: A total={a_pred_total} mean={a_pred_total/n:.1f} B total={b_pred_total} mean={b_pred_total/n:.1f}")
# Relative deltas are printed only when the A-side base is non-zero.
if a_pred_total > 0:
print(f" Predicate delta: B vs A = {(b_pred_total-a_pred_total)/a_pred_total*100:+.1f}%")
if a_edge_total > 0:
print(f" Edge delta: B vs A = {(b_edge_total-a_edge_total)/a_edge_total*100:+.1f}%")
# Per-bucket
print(f"\nPer-bucket:")
for bucket in ["high", "mid", "low", "document"]:
bucket_records = [r for r in records if r["bucket"] == bucket]
if not bucket_records:
continue
bn = len(bucket_records)
a_p = sum(r["a_preds"] for r in bucket_records)
b_p = sum(r["b_preds"] for r in bucket_records)
a_e = sum(r["a_edges"] for r in bucket_records)
b_e = sum(r["b_edges"] for r in bucket_records)
# A bucket whose A-side predicate total is 0 reports a delta of 0%.
delta = ((b_p-a_p)/a_p*100) if a_p > 0 else 0
print(f" [{bucket:>8}] n={bn} A.preds={a_p:>3} B.preds={b_p:>3} ({delta:+.0f}%) A.edges={a_e:>3} B.edges={b_e:>3}")
# Persist per-source records and the aggregate totals.
with open(EXPERIMENTS / "cascade_reextract_corrected_comparison.json", "w") as f:
json.dump({"per_source": records,
"aggregate": {"a_preds": a_pred_total, "b_preds": b_pred_total,
"a_edges": a_edge_total, "b_edges": b_edge_total}}, f, indent=2)
print(f"\nSaved to {EXPERIMENTS / 'cascade_reextract_corrected_comparison.json'}")
+190
View File
@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""E1 orchestration — fetch source text, run Mistral metadata, submit to Graphiti test group_id."""
import json
import os
import requests
import subprocess
import time
from pathlib import Path
import psycopg2
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
PG_DSN = os.environ["PG_DSN"]
SIDECAR_URL = "http://localhost:8001"
TEST_GROUP_ID = "aaron_cascade_test"
MAX_DOC_CHARS = 12000 # Same cap as Tier 1 for parity
# Stage 2 metadata prompt — verbatim from stage-2-worker-spec.md
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
def get_pg():
    """Open and return a new psycopg2 connection for ``PG_DSN``."""
    connection = psycopg2.connect(PG_DSN)
    return connection
def fetch_source_text(source):
    """Reassemble the full document from pgvector chunks, mirroring tier1_migration.py logic.

    Args:
        source: Value of the ``source`` column identifying the document.

    Returns:
        All matching ``document`` chunks joined by blank lines in ``id``
        order, or None when no chunk exists for *source*.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises, so a failing lookup does not
        # leak a database connection.
        conn.close()
    if row is None or row[0] is None:
        return None
    return row[0]
def run_mistral_metadata(text):
    """Call local Mistral via Ollama for base-class metadata.

    Only the first ``MAX_DOC_CHARS`` characters of *text* are sent.

    Returns:
        The parsed metadata dict with ``char_length`` overwritten by the
        length of the truncated text, or ``{"error": ..., "raw": ...}`` when
        the model's reply is not valid JSON or is not a JSON object.

    Raises:
        requests.HTTPError: if Ollama answers with an error status.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    if not isinstance(metadata, dict):
        # Valid JSON that is not an object (list, string, number) cannot take
        # the char_length key below; report it instead of raising TypeError.
        return {"error": "metadata is not a JSON object", "raw": raw[:500]}
    # Override char_length with python-computed value (per stage-2-worker-spec)
    metadata["char_length"] = len(truncated)
    return metadata
def format_metadata_as_orientation(metadata):
    """Render metadata as an orienting ``source_description`` string.

    A metadata dict carrying an ``error`` key yields a short failure marker.
    Otherwise the domain, format and one-sentence summary are embedded in a
    fixed paragraph telling the extractor to treat them as context only.
    Missing domain/format fall back to "unknown", a missing summary to "".
    """
    if "error" in metadata:
        return f"tier1_cascade_test (metadata generation failed: {metadata['error']})"

    domain_label = metadata.get("domain_class", "unknown")
    format_label = metadata.get("primary_format", "unknown")
    summary_text = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain_label} document in {format_label} format.",
        f"Summary: {summary_text}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
def submit_episode(name, content, source_description):
    """POST one episode to the sidecar's bulk endpoint under ``TEST_GROUP_ID``.

    *content* is cut to ``MAX_DOC_CHARS`` characters before sending. Returns
    the decoded JSON reply; an HTTP error status raises via
    ``raise_for_status``.
    """
    episode = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": source_description,
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": TEST_GROUP_ID}
    reply = requests.post(f"{SIDECAR_URL}/episodes/bulk", json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
def _save_results(results):
    """Write every record collected so far to RESULTS_FILE as JSON."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Re-extract each sampled episode into the test group with Mistral-generated orientation.

    For every entry in ``SAMPLE_FILE["selected"]``: fetch its source text,
    generate metadata with Mistral, submit the episode to Graphiti with the
    formatted metadata as its source_description, and record timings and
    outcomes. RESULTS_FILE is rewritten after every episode, including
    episodes skipped for lack of source text, and once more at the end.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 cascade re-extraction starting — {len(selected)} episodes to test group_id={TEST_GROUP_ID}\n")
    results = []
    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        # Fetch text
        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Persist the skip as well; previously a skipped final episode
            # never reached the results file.
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        # Mistral metadata
        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # Submit to Graphiti
        source_desc = format_metadata_as_orientation(metadata)
        record["source_description"] = source_desc
        print(f" Submitting to Graphiti test group...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode(name, text, source_desc)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best effort: one failed submission must not stop the batch.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)
        results.append(record)

        # Save intermediate state after each episode
        _save_results(results)
        print()

    # Final write so the message below holds even for an empty sample.
    _save_results(results)
    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
+181
View File
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""E1 corrected re-run — cascade orientation passed via custom_extraction_instructions."""
import json
import os
import requests
import time
from pathlib import Path
import psycopg2
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
PG_DSN = os.environ["PG_DSN"]
SIDECAR_URL = "http://localhost:8001"
TEST_GROUP_ID = "aaron_cascade_test"
MAX_DOC_CHARS = 12000
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
def get_pg():
    """Open and return a new psycopg2 connection for PG_DSN; the caller closes it."""
    connection = psycopg2.connect(PG_DSN)
    return connection
def fetch_source_text(source):
    """Return the full text stored for ``source``, or None when it has no chunks.

    All ``embeddings.document`` rows whose ``source`` matches are concatenated
    in ``id`` order, separated by a blank line.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close the connection even when the query raises, so a failed lookup
        # does not leak it.
        conn.close()
    if row is None or row[0] is None:
        return None
    return row[0]
def run_mistral_metadata(text):
    """Ask the local Mistral model (via Ollama) for metadata about ``text``.

    Only the first MAX_DOC_CHARS characters are sent.

    Returns:
        The parsed metadata dict, with ``char_length`` replaced by the length
        of the truncated text, or ``{"error": ..., "raw": ...}`` when the
        model output is not a JSON object.

    Raises:
        requests.HTTPError: if Ollama answers with a non-2xx status.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    if not isinstance(metadata, dict):
        # A JSON list/string/number would raise on the item assignment below;
        # treat it as a metadata failure like any other unusable reply.
        return {"error": "JSON root is not an object", "raw": raw[:500]}
    metadata["char_length"] = len(truncated)
    return metadata
def format_metadata_as_orientation(metadata):
    """Build the orient-not-bound extraction instructions from ``metadata``.

    Returns None when ``metadata`` carries an "error" key.
    """
    if "error" in metadata:
        return None

    doc_domain = metadata.get("domain_class", "unknown")
    doc_format = metadata.get("primary_format", "unknown")
    gist = metadata.get("one_sentence_summary", "")

    intro = f"This is a {doc_domain} document in {doc_format} format. "
    recap = f"Summary: {gist} "
    framing = (
        "This metadata is provided to orient your extraction, not to constrain it. "
        "Extract entities and relationships freely from the document text itself; "
        "the metadata is descriptive context, not a checklist."
    )
    return intro + recap + framing
def submit_episode_singular(name, content, custom_instructions):
    """Post a single episode to the sidecar's /episodes endpoint with cascade orientation.

    The orientation text is sent as ``custom_extraction_instructions``, while
    ``source_description`` carries only a fixed label. ``content`` is cut to
    MAX_DOC_CHARS. Returns the decoded JSON reply; a non-2xx status raises.
    """
    endpoint = f"{SIDECAR_URL}/episodes"
    body = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": "e1_corrected_run",  # neutral label, not the cascade text
        "timestamp": "2026-04-28T00:00:00",
        "group_id": TEST_GROUP_ID,
        "custom_extraction_instructions": custom_instructions,
    }
    reply = requests.post(endpoint, json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
def _save_results(results):
    """Write every record collected so far to RESULTS_FILE as JSON."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Re-run the E1 sample through the singular /episodes endpoint.

    For every entry in ``SAMPLE_FILE["selected"]``: fetch its source text,
    generate metadata with Mistral, and submit the episode with the formatted
    metadata passed as ``custom_extraction_instructions``. RESULTS_FILE is
    rewritten after every episode, including episodes skipped for lack of
    source text, and once more at the end.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 CORRECTED re-run — {len(selected)} episodes via /episodes (singular)")
    print(f"Cascade orientation passed in custom_extraction_instructions.\n")
    results = []
    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Persist the skip as well; previously a skipped final episode
            # never reached the results file.
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # None when metadata generation failed; the episode is still submitted.
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions
        print(f" Submitting via /episodes (singular) with custom_extraction_instructions...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best effort: one failed submission must not stop the batch.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)
        results.append(record)

        _save_results(results)
        print()

    # Final write so the message below holds even for an empty sample.
    _save_results(results)
    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""E1 sample selection — pick 10 episodes from Tier 1 stratified by density and type."""
import json
import os
import subprocess
from pathlib import Path
from collections import defaultdict
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
OUTPUT = EXPERIMENTS / "cascade_reextract_sample.json"
# Get all Tier 1 episodes with their entity counts via FalkorDB
def query_episode_counts():
    """Return every Episodic node of the ``aaron`` graph with its distinct-entity count.

    Runs a Cypher query through ``redis-cli`` inside the ``falkordb``
    container and parses the plain-text reply, where each row is a name line
    followed by a count line.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in the order the
        query prints them (``ORDER BY entities DESC``).

    Raises:
        RuntimeError: if the docker/redis-cli command exits non-zero.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Without this check a failed command parses as "no episodes" and only
        # surfaces later as an unrelated IndexError.
        raise RuntimeError(
            f"FalkorDB query failed (exit {result.returncode}): {result.stderr.strip()}"
        )

    # Parse the output — redis-cli returns rows after a header
    lines = [line for line in result.stdout.split("\n") if line.strip()]
    episodes = []
    # Skip header rows ("name", "entities") and timing rows
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            i += 2  # skip "name" and "entities" headers
            continue
        if lines[i].startswith(("Cached", "Query")):
            break
        # Each episode: name on one line, count on next
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
            except ValueError:
                # Next line is not a count: resynchronise one line further on.
                i += 1
            else:
                episodes.append({"name": lines[i], "entities": count})
                i += 2
        else:
            i += 1
    return episodes
# Load every Tier 1 episode with its entity count once; all bucketing below works from this list.
print("Fetching episode entity counts from FalkorDB...")
episodes = query_episode_counts()
print(f"Got {len(episodes)} episodes")
# Classify by density bucket and type
def is_document(name):
    """Return True when ``name`` ends with a known document extension (case-insensitive)."""
    lowered = name.lower()
    return lowered.endswith((".pdf", ".docx", ".pptx", ".txt", ".md"))
# Compute quartile boundaries from the entity counts
if not episodes:
    # With nothing to sample, the index lookups below would fail with an
    # opaque IndexError; stop with an explicit message instead.
    raise SystemExit("No episodes returned from FalkorDB — cannot compute quartile boundaries.")
counts = sorted((e["entities"] for e in episodes), reverse=True)
n = len(counts)
top_q = counts[n // 4]  # 25th percentile from top
bottom_q = counts[3 * n // 4]  # 75th percentile from top
print(f"\nQuartile boundaries: top={top_q}+, middle=({bottom_q+1}-{top_q-1}), bottom=0-{bottom_q}")

# Conversations are split by density; documents form their own bucket and
# must have at least 5 entities to be eligible.
high = [e for e in episodes if e["entities"] >= top_q and not is_document(e["name"])]
mid = [e for e in episodes if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
low = [e for e in episodes if e["entities"] <= bottom_q and not is_document(e["name"])]
docs = [e for e in episodes if is_document(e["name"]) and e["entities"] >= 5]
print(f"High-density conversations: {len(high)}")
print(f"Mid-density conversations: {len(mid)}")
print(f"Low-density conversations: {len(low)}")
print(f"Documents (≥5 entities): {len(docs)}")
# Deterministic selection — take from middle of each bucket to avoid edge cases
def pick(bucket, n):
    """Take ``n`` consecutive items centred on the middle of ``bucket``.

    A bucket holding fewer than ``n`` items is returned unchanged (the same
    list object, not a copy).
    """
    size = len(bucket)
    if size >= n:
        first = max(0, size // 2 - n // 2)
        return bucket[first:first + n]
    return bucket
# Fixed quota per stratum: 3 high + 3 mid + 2 low conversations + 2 documents
# (10 in total when every bucket holds at least its quota).
selected = (
pick(high, 3) +
pick(mid, 3) +
pick(low, 2) +
pick(docs, 2)
)
# Tag each with its bucket
def bucket_for(ep):
    """Return the bucket label for ``ep``: "document", "high", "mid" or "low".

    Document names take precedence; otherwise the entity count is compared
    against the module-level ``top_q`` / ``bottom_q`` boundaries.
    """
    if is_document(ep["name"]):
        return "document"
    entity_count = ep["entities"]
    if entity_count >= top_q:
        return "high"
    return "mid" if entity_count > bottom_q else "low"
# Label each chosen episode with its bucket, then echo the selection.
for chosen in selected:
    chosen["bucket"] = bucket_for(chosen)
print(f"\nSelected {len(selected)} episodes for E1:")
for chosen in selected:
    print(f" [{chosen['bucket']:>8}] {chosen['entities']:>3}e {chosen['name']}")

# Save selection, with the parameters that produced it
selection_doc = {
    "metadata": {
        "purpose": "E1 cascade re-extraction sample (n=10)",
        "stratification": "density buckets + document subset",
        "quartile_top": top_q,
        "quartile_bottom": bottom_q,
        "total_tier1_episodes": len(episodes),
    },
    "selected": selected,
}
with open(OUTPUT, "w") as out:
    json.dump(selection_doc, out, indent=2)
print(f"\nSaved to {OUTPUT}")
+24
View File
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""E2 follow-up: confirm Aaron AI alias situation, find other potential duplicates."""
import subprocess
# (label, Cypher) pairs run in order against the "aaron" graph.
QUERIES = [
    ("Aaron AI variants",
     "MATCH (n:Entity) WHERE n.name CONTAINS 'Aaron AI' OR n.name CONTAINS 'ARIN' OR n.name CONTAINS 'RNAI' RETURN n.name, n.summary"),
    ("All Mossygear-named entities",
     "MATCH (n:Entity) WHERE n.name CONTAINS 'Mossy' OR n.name CONTAINS 'A+K' OR n.name CONTAINS 'AK Design' RETURN n.name, n.summary"),
    ("Total entity count check",
     "MATCH (n:Entity) RETURN count(n) as total"),
    ("Top 30 entity names by edge count",
     "MATCH (n:Entity)-[r]-() RETURN n.name, count(r) as edges ORDER BY edges DESC LIMIT 30"),
]

BANNER = "=" * 60

# Print a banner per query followed by redis-cli's raw output.
for label, query in QUERIES:
    print(f"\n{BANNER}")
    print(f"QUERY: {label}")
    print(BANNER)
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
+20
View File
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
"""E2: Entity resolution diagnostic. Queries Graphiti's FalkorDB for the six test entities."""
import subprocess
import sys
TEST_ENTITIES = ["Aaron", "Kat", "HVAMC", "Bird", "Susan Hamlet", "Tulsa album"]
def run_cypher(query):
    """Run ``query`` against the "aaron" graph via redis-cli in the falkordb container.

    Returns redis-cli's stdout as text; the exit status is not checked.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
# For each test entity, list all Entity nodes whose name contains it.
for entity_name in TEST_ENTITIES:
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"ENTITY: {entity_name}")
    print(divider)
    cypher = f"MATCH (n:Entity) WHERE n.name CONTAINS '{entity_name}' RETURN n.name, n.summary"
    print(run_cypher(cypher))
+24
View File
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""E2 follow-up: how many distinct episodes connect to each entity?"""
import subprocess
# (entity label, Cypher) pairs: up to 30 distinct episode names linked to each entity.
QUERIES = [
    ("Aaron", "MATCH (n:Entity {name: 'Aaron'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("Nelson", "MATCH (n:Entity {name: 'Nelson'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("HVAMC", "MATCH (n:Entity {name: 'HVAMC'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("Bird", "MATCH (n:Entity {name: 'Bird'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("Tulsa album", "MATCH (n:Entity {name: 'Tulsa album'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("Susan Hamlet", "MATCH (n:Entity {name: 'Susan Hamlet'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("Kat", "MATCH (n:Entity {name: 'Kat'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
    ("Katherine Wilson", "MATCH (n:Entity {name: 'Katherine Wilson'})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"),
]

BANNER = "=" * 60

# Print a banner per entity followed by redis-cli's raw output.
for label, query in QUERIES:
    print(f"\n{BANNER}")
    print(f"ENTITY: {label}")
    print(BANNER)
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
+190
View File
@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""
E1.8 Phase 2 — Evaluate
Pulls predicate counts from FalkorDB for each group_id and compares.
Run after e1_8_taxfree_cascade.py completes.
"""
import json, subprocess
from pathlib import Path
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
EVAL_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
GROUP_TAXFREE = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"
GROUP_PROD = "aaron"
GROUP_E14 = "aaron_cascade_e14"
def query(group_id, cypher):
    """Run ``cypher`` against graph ``group_id`` via redis-cli in the falkordb container.

    Returns redis-cli's stdout as text; the exit status is not checked.
    """
    argv = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(argv, capture_output=True, text=True)
    return completed.stdout
def get_episode_uuid(group_id, episode_name):
    """Look up the uuid of the Episodic node named ``episode_name`` in graph ``group_id``.

    Returns the first output line shaped like a UUID (36 characters with four
    hyphens), or None when no such line is found.
    """
    # Escape backslashes first, then single quotes, so the name forms a valid
    # single-quoted Cypher string literal. The previous replace("'", "\'")
    # changed nothing, because "\'" is just "'" in Python.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for line in output.split("\n"):
        line = line.strip()
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
def count_preds(group_id, uuid):
    """Count distinct RELATES_TO edge names whose ``episodes`` list contains ``uuid``.

    Returns the first all-digit line of the query output as an int, or 0 when
    the output has none.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    stripped = (raw.strip() for raw in query(group_id, cypher).split("\n"))
    return next((int(token) for token in stripped if token.isdigit()), 0)
def count_edges(group_id, uuid):
    """Count RELATES_TO edges whose ``episodes`` list contains ``uuid``.

    Returns the first all-digit line of the query output as an int, or 0 when
    the output has none.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    stripped = (raw.strip() for raw in query(group_id, cypher).split("\n"))
    return next((int(token) for token in stripped if token.isdigit()), 0)
def eval_source(name, groups):
    """Collect predicate and edge counts for episode ``name`` in each graph of ``groups``.

    ``groups`` maps a short label to a group_id. The result holds ``name``
    plus ``<label>_preds`` and ``<label>_edges`` per label; both are None
    when the episode is not found in that graph.
    """
    summary = {"name": name}
    for label, group_id in groups.items():
        episode_uuid = get_episode_uuid(group_id, name)
        preds = edges = None
        if episode_uuid:
            preds = count_preds(group_id, episode_uuid)
            edges = count_edges(group_id, episode_uuid)
        summary[f"{label}_preds"] = preds
        summary[f"{label}_edges"] = edges
    return summary
def run():
    """Compare per-source predicate counts across the E1.8 conditions and apply the decision rule.

    Reads the ingest records from RESULTS_PATH, queries FalkorDB for each
    source in every relevant group, prints per-source and aggregate deltas,
    and writes the per-source records to EVAL_PATH.

    The decision rule uses Sub-sample A only: the mean taxonomy-free delta
    minus the mean E1.4 cascade delta, both relative to production.
    """
    print("E1.8 — Evaluation phase")
    print("=" * 60)
    results = json.loads(RESULTS_PATH.read_text())
    eval_results = {"subsample_a": [], "subsample_b": []}

    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
    print("\nSub-sample A")
    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
    print("-" * 90)
    a_records = []
    for item in results["subsample_a"]:
        name = item["name"]
        r = eval_source(name, {
            "prod": GROUP_PROD,
            "e14": GROUP_E14,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["e14_delta_preds"] = item.get("e14_delta_preds")
        # Missing episodes (None) are displayed as 0; deltas are 0 when prod is 0.
        prod = r.get("prod_preds") or 0
        e14 = r.get("e14_preds") or 0
        tf = r.get("tf_preds") or 0
        e14_delta = ((e14 - prod) / prod * 100) if prod > 0 else 0
        tf_delta = ((tf - prod) / prod * 100) if prod > 0 else 0
        display = name[:53] + ".." if len(name) > 55 else name
        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")
        r["tf_delta_vs_prod"] = tf_delta
        r["e14_delta_vs_prod"] = e14_delta
        a_records.append(r)
        eval_results["subsample_a"].append(r)

    # Aggregate Sub-sample A (only sources with non-zero prod and taxfree counts)
    improvement = None
    valid = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
    if valid:
        mean_e14_delta = sum(r["e14_delta_vs_prod"] for r in valid) / len(valid)
        mean_tf_delta_a = sum(r["tf_delta_vs_prod"] for r in valid) / len(valid)
        # Computed here so the Sub-sample B aggregate below cannot overwrite
        # the value the decision rule depends on.
        improvement = mean_tf_delta_a - mean_e14_delta
        print(f"\nAggregate Sub-sample A (n={len(valid)}):")
        print(f" E1.4 cascade mean delta vs prod: {mean_e14_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs prod: {mean_tf_delta_a:+.1f}%")
        print(f" Taxonomy-free vs E1.4 cascade: {improvement:+.1f}pp")

    # Sub-sample B — all three conditions
    print("\n\nSub-sample B")
    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
    print("-" * 90)
    b_records = []
    for item in results["subsample_b"]:
        name = item["name"]
        r = eval_source(name, {
            "base": GROUP_BASELINE,
            "std": GROUP_STANDARD,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["standard_metadata"] = item.get("standard_metadata")
        base = r.get("base_preds") or 0
        std = r.get("std_preds") or 0
        tf = r.get("tf_preds") or 0
        std_delta = ((std - base) / base * 100) if base > 0 else 0
        tf_delta = ((tf - base) / base * 100) if base > 0 else 0
        display = name[:53] + ".." if len(name) > 55 else name
        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")
        r["std_delta_vs_base"] = std_delta
        r["tf_delta_vs_base"] = tf_delta
        b_records.append(r)
        eval_results["subsample_b"].append(r)

    # Aggregate Sub-sample B
    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
    if valid_b:
        mean_std_delta = sum(r["std_delta_vs_base"] for r in valid_b) / len(valid_b)
        mean_tf_delta_b = sum(r["tf_delta_vs_base"] for r in valid_b) / len(valid_b)
        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
        print(f" Standard cascade mean delta vs baseline: {mean_std_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs baseline: {mean_tf_delta_b:+.1f}%")

        # By bucket
        print("\nPer-bucket (Sub-sample B):")
        for bucket in ["high", "mid", "document"]:
            br = [r for r in valid_b if r["bucket"] == bucket]
            if not br:
                continue
            m_std = sum(r["std_delta_vs_base"] for r in br) / len(br)
            m_tf = sum(r["tf_delta_vs_base"] for r in br) / len(br)
            print(f" [{bucket:>8}] n={len(br)} std={m_std:+.0f}% tf={m_tf:+.0f}%")

    # Decision rule evaluation (Sub-sample A: taxonomy-free vs E1.4 cascade)
    print("\n" + "=" * 60)
    print("DECISION RULE:")
    if improvement is not None:
        if improvement >= 20:
            print(f" ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
        elif improvement >= 5:
            print(f" ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
        elif improvement >= 0:
            print(f" ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
        else:
            print(f" ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")

    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
    print(f"\nEval saved to {EVAL_PATH}")


if __name__ == "__main__":
    run()
+285
View File
@@ -0,0 +1,285 @@
#!/usr/bin/env python3
"""
E1.8 Phase 1 — Ingest
Runs taxonomy-free and standard cascade ingestion for Sub-samples A and B.
Run this first, then run e1_8_eval.py to pull predicate counts.
"""
import os, json, time, psycopg2, requests
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
PG_DSN = os.getenv("PG_DSN")
GRAPHITI_URL = "http://localhost:8001"
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
GROUP_TAXFREE = "aaron_e18_taxfree"
GROUP_BASELINE = "aaron_e18_baseline"
GROUP_STANDARD = "aaron_e18_standard"
TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.
Do not summarize content. Do not extract entities. Do not assign a single category label.
Instead, describe:
- What domains or frames are active in this content (there may be several simultaneously)
- How those frames relate to each other in this specific document
- What kind of relational content a knowledge graph extractor should look for
Output JSON only. No prose, no explanation, no markdown.
Schema:
{
"active_frames": ["<frame 1>", "<frame 2>", ...],
"frame_relationships": "<one sentence describing how the frames interact in this document>",
"extraction_orientation": "<one sentence orienting the extractor toward the most relationship-rich content>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
SUBSAMPLE_A = [
{"name": "Claude: Lubbock on everything album lyrics", "bucket": "high"},
{"name": "ChatGPT: Tulsa Concept Album Guide", "bucket": "high"},
{"name": "ChatGPT: Rhino 3D object flow", "bucket": "high"},
{"name": "Claude: SUNY faculty conflict of interest policies", "bucket": "mid"},
{"name": "Claude: Interview presentation research and preparation", "bucket": "mid"},
{"name": "Claude: Research Statement Restructure", "bucket": "mid"},
{"name": "ChatGPT: Respect Individual Interests for Christmas", "bucket": "low"},
{"name": "University of North Texas Cover letter.pdf", "bucket": "document"},
{"name": "Claude: Finding ideal rural housing near University of Utah", "bucket": "high"},
{"name": "ChatGPT: SEC coaches with OSU ties", "bucket": "high"},
{"name": "Claude: Bonding ASA 3D printed parts", "bucket": "mid"},
{"name": "ChatGPT: Title: User request summary.", "bucket": "low"},
{"name": "ChatGPT: Scholarship Recommendation Letter Tips", "bucket": "low"},
]
SUBSAMPLE_B = [
{"name": "ChatGPT: Job application comparison", "bucket": "high"},
{"name": "ChatGPT: External review for tenure", "bucket": "high"},
{"name": "Claude: University of Utah interview teaching example", "bucket": "high"},
{"name": "ChatGPT: Starting Dropship Gun Business", "bucket": "high"},
{"name": "ChatGPT: Analyze business plan", "bucket": "high"},
{"name": "ChatGPT: Outdoor Layering Explained", "bucket": "mid"},
{"name": "ChatGPT: Limits in Calculus.", "bucket": "mid"},
{"name": "ChatGPT: Academic Program Director Role", "bucket": "mid"},
{"name": "ChatGPT: Lonely Island Poop Skit", "bucket": "mid"},
{"name": "ChatGPT: Parse Tidal playlist", "bucket": "mid"},
{"name": "NO thesis proposal.pdf", "bucket": "document"},
{"name": "PWM.pdf", "bucket": "document"},
{"name": "Will_It_Print.pdf", "bucket": "document"},
{"name": "Kim Kedem Ind Study F2025 Syllabus.docx", "bucket": "document"},
{"name": "Aaron Nelson Graduate Transcript.pdf", "bucket": "document"},
]
def get_pg():
    """Open and return a new psycopg2 connection for PG_DSN; the caller closes it."""
    connection = psycopg2.connect(PG_DSN)
    return connection
def get_document_text(source_name):
    """Return up to 12000 characters of stored text for ``source_name``.

    The first 20 ``embeddings.document`` rows for the source (ordered by id)
    are joined with single spaces and the result is cut to 12000 characters.
    Returns "" when no rows match.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        cur.execute("SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20", (source_name,))
        rows = cur.fetchall()
    finally:
        # Close the connection even when the query raises, so a failed lookup
        # does not leak it.
        pg.close()
    # Skip NULL chunks; str.join would raise TypeError on None.
    return " ".join(r[0] for r in rows if r[0] is not None)[:12000]
def run_mistral(prompt_prefix, doc_text, label=""):
    """Send ``prompt_prefix + doc_text`` to the local Mistral model via Ollama.

    Args:
        prompt_prefix: Instruction prompt placed before the document text.
        doc_text: Document text appended to the prompt.
        label: Short tag used only in the progress messages.

    Returns:
        The parsed JSON object as a dict, or
        ``{"error": "parse_failed", "raw": ...}`` when the model output is
        missing, is not valid JSON, or is not a JSON object.

    Raises:
        requests.HTTPError: if Ollama answers with a non-2xx status.
    """
    print(f" → Mistral {label} running...", flush=True)
    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    resp.raise_for_status()
    raw = resp.json().get("response", "{}")
    if raw is None:
        # A null "response" would crash len() below; the empty string is
        # reported as a parse failure instead.
        raw = ""
    print(f" → Mistral {label} done ({len(raw)} chars)", flush=True)
    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError):
        return {"error": "parse_failed", "raw": str(raw)[:200]}
    if not isinstance(parsed, dict):
        # Callers use .get() on the result; a JSON list/string/number would
        # break them, so it is reported as a parse failure too.
        return {"error": "parse_failed", "raw": str(raw)[:200]}
    return parsed
def build_taxfree_orientation(meta):
    """Render taxonomy-free metadata as a single orientation string.

    ``meta`` is model-generated, so its shape is not trusted: missing or null
    fields become empty text, a non-list ``active_frames`` value is treated
    as a single frame, and non-string frames are converted with ``str``.
    """
    frames_value = meta.get("active_frames") or []
    if not isinstance(frames_value, (list, tuple)):
        # A bare string would otherwise be joined character by character.
        frames_value = [frames_value]
    frames = ", ".join(str(frame) for frame in frames_value)
    rel = meta.get("frame_relationships") or ""
    orient = meta.get("extraction_orientation") or ""
    summary = meta.get("one_sentence_summary") or ""
    return f"Active frames: {frames}. Frame relationships: {rel} Extraction focus: {orient} Summary: {summary}"
def build_standard_orientation(meta):
    """Render standard-schema metadata as newline-separated ``key: value`` lines.

    Emits domain_class, primary_format, one_sentence_summary and two of the
    content signals. Missing fields fall back to "unknown", "" or False.
    """
    signals = meta.get("content_signals", {})
    rows = [
        f"domain_class: {meta.get('domain_class', 'unknown')}",
        f"primary_format: {meta.get('primary_format', 'unknown')}",
        f"one_sentence_summary: {meta.get('one_sentence_summary', '')}",
        f"has_named_people: {signals.get('has_named_people', False)}",
        f"has_technical_terminology: {signals.get('has_technical_terminology', False)}",
    ]
    return "\n".join(rows)
def ingest(source_name, doc_text, orientation, group_id):
    """Submit one episode to Graphiti's bulk endpoint under ``group_id``.

    ``orientation`` is sent as the episode's source_description and
    ``doc_text`` is cut to 12000 characters. Returns nothing; a non-2xx
    status raises via ``raise_for_status``.
    """
    episode = {
        "name": source_name,
        "content": doc_text[:12000],
        "source_description": orientation,
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": group_id}
    reply = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=body, timeout=300)
    reply.raise_for_status()
def save(results):
    """Overwrite RESULTS_PATH with ``results`` serialised as indented JSON."""
    serialized = json.dumps(results, indent=2)
    RESULTS_PATH.write_text(serialized)
def run():
    """Ingest Sub-samples A and B into their E1.8 test groups, resuming if possible.

    Sub-sample A is ingested only under the taxonomy-free condition; its
    baseline and standard figures are copied from the E1.4 comparison file.
    Sub-sample B is ingested under all three conditions (baseline, standard,
    taxonomy-free). Progress is written to RESULTS_PATH after every source,
    and sources already present there are skipped on a re-run.
    """
    print("E1.8 — Ingest phase")
    print("=" * 60)

    # Load existing results if resuming
    if RESULTS_PATH.exists():
        results = json.loads(RESULTS_PATH.read_text())
        # Guarantee both lists exist, so the appends below cannot raise
        # KeyError after a source has already been ingested.
        results.setdefault("subsample_a", [])
        results.setdefault("subsample_b", [])
        done_a = {r["name"] for r in results["subsample_a"]}
        done_b = {r["name"] for r in results["subsample_b"]}
        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")
    else:
        results = {"subsample_a": [], "subsample_b": []}
        done_a, done_b = set(), set()

    e14_data = json.loads((Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json").read_text())
    e14_by_name = {s["name"]: s for s in e14_data}

    # Sub-sample A — taxonomy-free only (baseline + standard from E1.4)
    print("\nSub-sample A — taxonomy-free ingestion only")
    for item in SUBSAMPLE_A:
        name = item["name"]
        if name in done_a:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name}")
        doc_text = get_document_text(name)
        if not doc_text:
            print(f" SKIP — no text")
            continue
        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        orientation = build_taxfree_orientation(tf_meta)
        try:
            ingest(name, doc_text, orientation, GROUP_TAXFREE)
            time.sleep(3)
            print(f" ingested to {GROUP_TAXFREE}")
        except Exception as e:
            # Not recorded, so the source is retried on the next run.
            print(f" ingest failed: {e}")
            continue
        e14 = e14_by_name.get(name, {})
        results["subsample_a"].append({
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": tf_meta,
            "taxfree_orientation": orientation,
            "e14_prod_preds": e14.get("prod_preds"),
            "e14_cascade_preds": e14.get("cascade_preds"),
            "e14_delta_preds": e14.get("delta_preds"),
            "e14_prod_edges": e14.get("prod_edges"),
            "e14_cascade_edges": e14.get("cascade_edges"),
            "e14_delta_edges": e14.get("delta_edges"),
        })
        save(results)

    # Sub-sample B — all three conditions
    print("\nSub-sample B — all three conditions")
    for item in SUBSAMPLE_B:
        name = item["name"]
        if name in done_b:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name} ({item['bucket']})")
        doc_text = get_document_text(name)
        if not doc_text:
            print(f" SKIP — no text")
            continue
        entry = {"name": name, "bucket": item["bucket"],
                 "taxfree_metadata": None, "standard_metadata": None}

        # Baseline (empty source_description)
        try:
            ingest(name, doc_text, "", GROUP_BASELINE)
            time.sleep(3)
            print(f" baseline ingested")
        except Exception as e:
            print(f" baseline failed: {e}")

        # Standard
        std_meta = run_mistral(STANDARD_PROMPT, doc_text, "standard")
        entry["standard_metadata"] = std_meta
        try:
            ingest(name, doc_text, build_standard_orientation(std_meta), GROUP_STANDARD)
            time.sleep(3)
            print(f" standard ingested, domain_class={std_meta.get('domain_class','?')}")
        except Exception as e:
            print(f" standard failed: {e}")

        # Taxonomy-free
        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
        entry["taxfree_metadata"] = tf_meta
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        try:
            ingest(name, doc_text, build_taxfree_orientation(tf_meta), GROUP_TAXFREE)
            time.sleep(3)
            print(f" taxfree ingested")
        except Exception as e:
            print(f" taxfree failed: {e}")

        # The entry is recorded even when a condition failed, so a re-run
        # skips this source.
        results["subsample_b"].append(entry)
        save(results)

    print("\n" + "=" * 60)
    print(f"Ingest complete. Results at {RESULTS_PATH}")
    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")


if __name__ == "__main__":
    run()
+204
View File
@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
E1.9 Phase 1 — Retroactive validation
For each E1.8 source, query the production graph with frame_relationships
to get a coverage score, then check whether the routing tier prediction
matches the actual best-performing condition from E1.8.
No API spend required — uses existing E1.8 data and Graphiti search only.
"""
import json, requests
from pathlib import Path
GRAPHITI_URL = "http://localhost:8001"
E18_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
E18_INGEST_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_9_retroactive.json"
# Routing thresholds
HIGH_THRESHOLD = 0.70 # baseline
LOW_THRESHOLD = 0.40 # taxonomy-free
def get_coverage_score(query, group_id="aaron"):
    """Score how well the graph already covers ``query``, from the search hit count.

    At most 3 hits are requested and the score is ``hits / 3`` capped at 1.0
    (0 hits -> 0.0, 1 -> ~0.33, 2 -> ~0.67, 3 -> 1.0). Hit count is used
    because Graphiti fulltext search returns score=0 for all hits.
    An empty or whitespace-only query, or any search error, yields 0.0.
    """
    if query and query.strip():
        try:
            resp = requests.get(
                f"{GRAPHITI_URL}/search",
                params={"query": query, "limit": 3, "group_id": group_id},
                timeout=30
            )
            resp.raise_for_status()
            hits = resp.json().get("results", [])
            return min(len(hits) / 3.0, 1.0)
        except Exception as e:
            print(f" Search error: {e}")
    return 0.0
def assign_tier(coverage_score):
    """Map a coverage score to a routing tier.

    Scores at or above HIGH_THRESHOLD give "baseline", scores at or above
    LOW_THRESHOLD give "standard", and anything lower gives "taxfree".
    """
    if coverage_score >= HIGH_THRESHOLD:
        return "baseline"
    return "standard" if coverage_score >= LOW_THRESHOLD else "taxfree"
def best_condition_from_e18(record, subsample):
    """Name the condition with the highest predicate count for this source in E1.8.

    Sub-sample "a" reads prod (baseline), e14 (standard cascade) and tf
    (taxfree); any other value reads base, std and tf. Missing or None counts
    are treated as 0. Ties resolve to taxfree first, then standard, then
    baseline. Returns "unknown" when the best count is 0.
    """
    if subsample == "a":
        baseline_key, standard_key = "prod_preds", "e14_preds"
    else:
        baseline_key, standard_key = "base_preds", "std_preds"

    baseline = record.get(baseline_key) or 0
    standard = record.get(standard_key) or 0
    taxfree = record.get("tf_preds") or 0

    top = max(baseline, standard, taxfree)
    if top == 0:
        return "unknown"
    if taxfree == top:
        return "taxfree"
    return "standard" if standard == top else "baseline"
def _load_frame_relationships(e18_ingest):
    """Build a ``name -> frame_relationships`` lookup from the E1.8 ingest results."""
    lookup = {}
    for key in ("subsample_a", "subsample_b"):
        for item in e18_ingest.get(key, []):
            meta = item.get("taxfree_metadata", {})
            if meta:
                lookup[item["name"]] = meta.get("frame_relationships", "")
    return lookup


def _evaluate_subsample(records, subsample, fr_lookup):
    """Score one sub-sample, print its table, and return one result dict per record."""
    print(f"\nSub-sample {subsample.upper()}")
    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
    print("-" * 95)
    rows = []
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best
        # Records with no measurable best condition are not scored, so they
        # get a neutral marker instead of a hit/miss glyph.
        if actual_best == "unknown":
            marker = "-"
        else:
            marker = "✓" if matched else "✗"
        display = name[:48] + ".." if len(name) > 50 else name
        # The predicted condition is the assigned tier, so both columns show it.
        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {marker}")
        rows.append({
            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
            "frame_relationships": fr, "coverage_score": coverage,
            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
        })
    return rows


def run():
    """Run E1.9 Phase 1: check whether coverage-based routing predicts E1.8's best condition.

    For every E1.8 source the stored ``frame_relationships`` text is searched
    in the production graph, the resulting coverage score is mapped to a
    tier, and that tier is compared with the condition that performed best
    in E1.8.  Sources whose best condition is "unknown" are listed but
    excluded from the validation rate.  Prints per-source tables, a verdict,
    the mismatches and the score distribution, then writes everything to
    RESULTS_PATH as JSON.
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)
    e18_eval = json.loads(E18_PATH.read_text())
    e18_ingest = json.loads(E18_INGEST_PATH.read_text())
    fr_lookup = _load_frame_relationships(e18_ingest)

    results = []
    for subsample in ("a", "b"):
        records = e18_eval[f"subsample_{subsample}"]
        results.extend(_evaluate_subsample(records, subsample, fr_lookup))

    # Only sources with a known best condition count toward the rate.
    scored = [r for r in results if r["actual_best"] != "unknown"]
    total = len(scored)
    correct = sum(1 for r in scored if r["match"])

    # Summary
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()
    if rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print(" Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print(" Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print(" may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in scored if not r["match"]]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # bucket may be None (record.get), which cannot take a format spec.
            print(f" [{str(r['bucket']):<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print("\nCoverage score distribution:")
    if scores:
        print(f" Mean: {sum(scores)/len(scores):.2f}")
        print(f" Min: {min(scores):.2f}")
        print(f" Max: {max(scores):.2f}")
    else:
        # Mean/min/max are undefined without any evaluated source.
        print(" (no sources evaluated)")
    high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
    mid = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
    low = sum(1 for s in scores if s < LOW_THRESHOLD)
    print(f" Tier distribution: baseline={high} standard={mid} taxfree={low}")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2))
    print(f"\nSaved to {RESULTS_PATH}")
if __name__ == "__main__":
run()
+257
View File
@@ -0,0 +1,257 @@
#!/usr/bin/env python3
"""
Experiment 005 — Actual API Token Measurement
Measures input token reduction from prepending v2 briefing vs raw document
on Claude Haiku, validating the 42.0% modeled estimate from Experiment 002b.
Outputs: ~/aaronai/experiments/token_measurement_results.json
"""
import json
import os
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
# JSON file of v2 briefing results; main() loads its "documents" list from here.
INPUT_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Where main() writes the summary and per-document measurements.
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "token_measurement_results.json"
# Model id and max_tokens passed to client.messages.create() in call_haiku().
MODEL = "claude-haiku-4-5-20251001"
MAX_TOKENS = 1024
# Instruction text placed at the start of both the raw and the briefed user
# message, so the two conditions differ only in the added briefing.
EXTRACTION_PROMPT = (
    "Extract entities and their relationships from the document below. "
    "Return ONLY valid JSON with this schema:\n"
    "{\n"
    ' "people": [string],\n'
    ' "organizations": [string],\n'
    ' "locations": [string],\n'
    ' "dates": [string],\n'
    ' "relationships": [{"subject": string, "predicate": string, "object": string}]\n'
    "}\n"
    "No prose, no markdown fences, no commentary. JSON only."
)
def fetch_document_text(pg_conn, source):
    """Reconstruct a document by concatenating its chunks from pgvector.

    Args:
        pg_conn: Open database connection; only ``cursor()`` is used.
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        The ``document`` values of all matching rows, in ``id`` order, joined
        with blank lines; ``None`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query itself fails.
        cur.close()
    if not rows:
        return None
    return "\n\n".join(r[0] for r in rows)
def build_raw_message(document_text):
    """Return the user message for the raw condition: prompt, then the document."""
    sections = (
        EXTRACTION_PROMPT,
        f"DOCUMENT:\n{document_text}",
    )
    return "\n\n".join(sections)
def build_briefed_message(briefing, document_text):
    """Return the user message for the briefed condition.

    Same layout as the raw message, with the briefing (serialized as
    indented JSON) inserted between the prompt and the document.
    """
    briefing_json = json.dumps(briefing, indent=2)
    sections = (
        EXTRACTION_PROMPT,
        f"BRIEFING (pre-analysis from local model — use to orient):\n{briefing_json}",
        f"DOCUMENT:\n{document_text}",
    )
    return "\n\n".join(sections)
def call_haiku(client, message_text):
    """Send *message_text* as a single user turn and report usage for the call.

    Returns a dict with the API-reported ``input_tokens`` and
    ``output_tokens``, the wall-clock ``latency_s`` (rounded to 2 decimals),
    the ``response_text`` of the first content block ("" when the reply has
    no content) and the ``stop_reason``.  Exceptions from the client
    propagate to the caller.
    """
    started = time.time()
    reply = client.messages.create(
        model=MODEL,
        max_tokens=MAX_TOKENS,
        messages=[{"role": "user", "content": message_text}],
    )
    usage = reply.usage
    outcome = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    outcome["response_text"] = reply.content[0].text if reply.content else ""
    outcome["stop_reason"] = reply.stop_reason
    return outcome
def ci_95(values):
    """Return ``(mean, half_width)`` for a 95% interval around the mean.

    The half-width is ``1.96 * stdev / sqrt(n)``.  With fewer than two
    values no spread can be estimated, so the half-width is 0.0 and the
    mean is the single value (or 0.0 for an empty input).
    """
    n = len(values)
    if n >= 2:
        spread = statistics.stdev(values)
        return (statistics.mean(values), 1.96 * spread / (n ** 0.5))
    return (statistics.mean(values) if values else 0.0, 0.0)
def main():
    """Run Experiment 005 and write the measurements to OUTPUT_FILE.

    For every document in INPUT_FILE that has status "SUCCESS" and a
    briefing, the document text is rebuilt from the database and sent to the
    model twice: once raw and once with the briefing added.  The
    API-reported token counts of each pair are compared, aggregated, costed
    and saved as JSON together with the per-document results.

    Exits with status 1 when INPUT_FILE, ANTHROPIC_API_KEY or PG_DSN is
    missing.  A failed API call is recorded on that document and the run
    continues; database errors propagate.
    """
    if not INPUT_FILE.exists():
        print(f"ERROR: {INPUT_FILE} not found", file=sys.stderr)
        sys.exit(1)
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
        sys.exit(1)
    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    # Modeled reduction (percent) that the measured mean is compared against.
    v2_modeled_pct = 42.0

    client = anthropic.Anthropic(api_key=api_key)
    with open(INPUT_FILE, encoding="utf-8") as f:
        v2_data = json.load(f)
    docs_meta = [
        d for d in v2_data["documents"]
        if d.get("status") == "SUCCESS"
        and d.get("briefing")
    ]
    print(f"Loaded {len(docs_meta)} successful briefings from {INPUT_FILE.name}")
    print(f"Model: {MODEL}")
    print(f"Calls planned: up to {len(docs_meta) * 2}\n")

    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc in enumerate(docs_meta, 1):
            source = doc["source"]
            briefing = doc["briefing"]
            document_text = fetch_document_text(pg_conn, source)
            if not document_text:
                print(f"[{i:02d}/{len(docs_meta)}] {source[:60]} -- SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue
            print(f"[{i:02d}/{len(docs_meta)}] {source[:60]}")

            # Each call fails independently so one bad request does not
            # discard the other half of the pair or abort the run.
            try:
                raw_result = call_haiku(client, build_raw_message(document_text))
            except Exception as e:
                print(f" RAW FAILED: {e}")
                raw_result = {"error": str(e)}
            try:
                briefed_result = call_haiku(client, build_briefed_message(briefing, document_text))
            except Exception as e:
                print(f" BRIEFED FAILED: {e}")
                briefed_result = {"error": str(e)}

            # A delta exists only when both calls returned token usage.
            delta = None
            if "input_tokens" in raw_result and "input_tokens" in briefed_result:
                raw_in = raw_result["input_tokens"]
                briefed_in = briefed_result["input_tokens"]
                raw_out = raw_result["output_tokens"]
                briefed_out = briefed_result["output_tokens"]
                # Positive = the briefed call used fewer input tokens.
                # NOTE(review): build_briefed_message() sends the same prompt
                # and document plus the briefing, so this is expected to be
                # negative — confirm that is the intended comparison.
                input_red = (raw_in - briefed_in) / raw_in * 100 if raw_in else 0.0
                output_delta = (briefed_out - raw_out) / raw_out * 100 if raw_out else 0.0
                delta = {
                    "input_reduction_pct": round(input_red, 2),
                    "output_delta_pct": round(output_delta, 2),
                    "raw_input_tokens": raw_in,
                    "briefed_input_tokens": briefed_in,
                    "raw_output_tokens": raw_out,
                    "briefed_output_tokens": briefed_out,
                }
                print(
                    f" in: {raw_in} -> {briefed_in} ({input_red:+.1f}%) | "
                    f"out: {raw_out} -> {briefed_out}"
                )
            results.append({
                "source": source,
                "raw": raw_result,
                "briefed": briefed_result,
                "delta": delta,
            })
    finally:
        # Close the connection even when a query or lookup raises mid-run.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    valid = [r for r in results if r.get("delta") is not None]
    skipped = [r for r in results if r.get("skipped")]
    reductions = [r["delta"]["input_reduction_pct"] for r in valid]
    output_deltas = [r["delta"]["output_delta_pct"] for r in valid]
    raw_in_total = sum(r["delta"]["raw_input_tokens"] for r in valid)
    briefed_in_total = sum(r["delta"]["briefed_input_tokens"] for r in valid)
    raw_out_total = sum(r["delta"]["raw_output_tokens"] for r in valid)
    briefed_out_total = sum(r["delta"]["briefed_output_tokens"] for r in valid)

    # Rates in USD per one million tokens (the totals are divided by 1_000_000).
    HAIKU_IN = 1.0
    HAIKU_OUT = 5.0
    raw_cost = (raw_in_total * HAIKU_IN + raw_out_total * HAIKU_OUT) / 1_000_000
    briefed_cost = (briefed_in_total * HAIKU_IN + briefed_out_total * HAIKU_OUT) / 1_000_000

    mean_red, ci_half = ci_95(reductions)
    mean_out_delta, _ = ci_95(output_deltas)
    summary = {
        "experiment": "005",
        "title": "Actual API Token Measurement",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "extraction_prompt": EXTRACTION_PROMPT,
        "n_documents_attempted": len(docs_meta),
        "n_skipped_not_in_pgvector": len(skipped),
        "n_valid_pairs": len(valid),
        "n_failed": len(docs_meta) - len(valid) - len(skipped),
        "total_elapsed_s": total_elapsed,
        "input_token_reduction": {
            "mean_pct": round(mean_red, 2),
            "ci_95_half_width_pct": round(ci_half, 2),
            "median_pct": round(statistics.median(reductions), 2) if reductions else None,
            "min_pct": round(min(reductions), 2) if reductions else None,
            "max_pct": round(max(reductions), 2) if reductions else None,
            "stdev_pct": round(statistics.stdev(reductions), 2) if len(reductions) > 1 else 0.0,
        },
        "output_token_delta": {"mean_pct": round(mean_out_delta, 2)},
        "totals": {
            "raw_input_tokens": raw_in_total,
            "briefed_input_tokens": briefed_in_total,
            "raw_output_tokens": raw_out_total,
            "briefed_output_tokens": briefed_out_total,
            "raw_cost_usd": round(raw_cost, 4),
            "briefed_cost_usd": round(briefed_cost, 4),
            "savings_usd": round(raw_cost - briefed_cost, 4),
        },
        "comparison_to_v2_estimate": {
            "v2_modeled_reduction_pct": v2_modeled_pct,
            "measured_mean_reduction_pct": round(mean_red, 2),
            "delta_pct_points": round(mean_red - v2_modeled_pct, 2),
        },
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(docs_meta)} valid pairs in {total_elapsed}s")
    if skipped:
        print(f"Skipped (not in pgvector): {len(skipped)}")
    print(f"Mean input token reduction: {mean_red:.2f}% +/- {ci_half:.2f}% (95% CI)")
    print(f"V2 modeled estimate: {v2_modeled_pct}% | delta: {mean_red - v2_modeled_pct:+.2f} pts")
    print(f"Mean output token delta: {mean_out_delta:+.2f}%")
    print(f"Total cost: ${raw_cost + briefed_cost:.4f}")
    print(f"Results: {OUTPUT_FILE}")
if __name__ == "__main__":
main()
+43 -8
View File
@@ -299,22 +299,57 @@ class IngestHandler(FileSystemEventHandler):
self.pending = False
self.last_event = 0
def on_any_event(self, event):
def _should_ignore(self, path: Path) -> bool:
if path.name.startswith((".", "~$")):
return True
if "Admin/Backups" in str(path) or "Backups" in path.parts:
return True
if "Journal/Media" in str(path):
return True
return False
def on_created(self, event):
if event.is_directory:
return
path = Path(event.src_path)
if path.suffix.lower() not in SUPPORTED:
if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
return
if path.name.startswith((".", "~$")):
log.info(f"Event: created {path}")
self.pending = True
self.last_event = time.time()
def on_modified(self, event):
if event.is_directory:
return
if "Admin/Backups" in str(path) or "Backups" in path.parts:
path = Path(event.src_path)
if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
return
if "Journal/Media" in str(path):
log.info(f"Event: modified {path}")
self.pending = True
self.last_event = time.time()
def on_moved(self, event):
if event.is_directory:
return
if event.event_type not in ("modified", "created", "moved"):
# Nextcloud WebDAV writes .part temp files then renames to final path.
# src_path is the .part file; dest_path is the final filename.
dest = Path(event.dest_path)
if dest.suffix.lower() not in SUPPORTED or self._should_ignore(dest):
return
log.info(f"Event: {event.event_type} {event.src_path}")
self.pending = True
log.info(f"Event: moved -> {dest}")
self.pending = True
self.last_event = time.time()
def on_closed(self, event):
    """Mark an ingest as pending when a supported, non-ignored file is closed.

    Per the original note: FileClosedEvent fires on the final file after
    Nextcloud completes its write, so this is a belt-and-suspenders catch
    for any write pattern that on_moved does not see.
    """
    if event.is_directory:
        return
    closed_path = Path(event.src_path)
    is_supported = closed_path.suffix.lower() in SUPPORTED
    if not is_supported or self._should_ignore(closed_path):
        return
    log.info(f"Event: closed {closed_path}")
    # Flag the change and timestamp it for the code that reads these fields.
    self.pending = True
    self.last_event = time.time()