scripts/: separate production from experimental and deprecated

Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2,
base_class, cascade, cost_test, briefing, consistency, token series).
Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py,
tier1_migration.py — under the bespoke decision both target retired
substrate work).
Removes 19 .bak* files from disk (gitignored, never tracked; git history
is the durable record of every prior version).

The 11 production scripts remain in scripts/. All systemd ExecStart paths,
api.py subprocess calls, and cron jobs continue to resolve correctly —
verified by grep against /etc/systemd/system/aaronai-*.service, scripts/
references in api.py, and the user crontab.

Track 1 inventory cross-cutting finding: scripts/ mixed 11 production
files with 32 experimental scripts and ~20 .bak files. After this commit
a clean-room reader can identify the live workers from a directory listing
alone.

Found by Track 1 inventory 2026-05-02. See
~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning.

After commit, run:
1. git log --oneline -3 — show the new commit on top
2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
This commit is contained in:
2026-05-02 23:28:24 +00:00
parent 6f2d274d5d
commit 3f7fba7e0e
30 changed files with 0 additions and 0 deletions
+193
View File
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Audit Expansion Pack Generator — type-aware stratified draw of 12
documents from base_class_validation_results.json for n=20 audit expansion.
Per audit-expansion-protocol.md amendment 2026-04-28:
The seed=43 length-only random draw concentrated on course modules in the
small and medium buckets, missing voice captures, syllabi, and
conversational documents present in the candidate distribution.
This script implements type-aware stratification within each length
bucket to produce a sample representative of BirdAI's document-type mix.
Targets (12 total):
small (4): 2 course_module + 2 voice_capture
medium (4): 2 course_module + 1 syllabus + 1 other
large (4): 1 course_ppt + 1 syllabus + 1 faculty_report + 1 conversational
Output: ~/aaronai/experiments/audit_expansion_pack.json
Usage:
python3 ~/aaronai/scripts/audit_expansion_draw.py
python3 ~/aaronai/scripts/audit_expansion_draw.py --dry-run
"""
import argparse
import json
import random
import re
import sys
import time
from pathlib import Path
# Directory that holds the experiment input and output JSON files.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Candidate pool: main() reads the "results" list from this file.
VALIDATION_RESULTS = EXPERIMENTS / "base_class_validation_results.json"
# Existing audit pack; sources found in it are excluded from the new draw.
EXISTING_AUDIT_PACK = EXPERIMENTS / "base_class_audit_pack.json"
# Destination of the drawn pack (main() skips the write under --dry-run).
OUTPUT_FILE = EXPERIMENTS / "audit_expansion_pack.json"
# Passed to random.seed() before drawing so the sample is reproducible.
SEED = 43
# Type-aware targets per bucket: size bucket -> {document type: number to draw}.
# The type names are the labels returned by classify().
TYPE_TARGETS = {
    "small": {"course_module": 2, "voice_capture": 2},
    "medium": {"course_module": 2, "syllabus": 1, "other": 1},
    "large": {"course_ppt": 1, "syllabus": 1, "faculty_report": 1, "conversational": 1},
}
def classify(source, bucket):
    """Map a source filename to a document type label.

    Rules are tried in order and the first match wins:
    voice capture, conversational export, syllabus, faculty/annual report,
    course PPT (only when ``bucket == "large"``), course module, then "other".

    Args:
        source: Source filename or title. A missing or non-string value is
            classified as "other" instead of raising AttributeError, because
            callers obtain it with ``doc.get("source")`` which may be None.
        bucket: Size bucket name; only the value "large" changes the result
            (it enables the course_ppt rule).

    Returns:
        One of "voice_capture", "conversational", "syllabus",
        "faculty_report", "course_ppt", "course_module" or "other".
    """
    if not isinstance(source, str):
        return "other"
    s = source.lower()
    # Voice captures — pattern: YYYY-MM-DD-HH-MM-voice.md
    if re.match(r"\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-voice\.md$", source):
        return "voice_capture"
    # Conversational exports — pattern: "Claude: ..." or "ChatGPT: ..."
    # (case-sensitive prefix check on the original name)
    if source.startswith(("Claude:", "ChatGPT:")):
        return "conversational"
    # Syllabus — must contain "syllabus" in the name
    if "syllabus" in s:
        return "syllabus"
    # Faculty / annual reports
    if "faculty report" in s or "annual report" in s:
        return "faculty_report"
    # Course PPTs (large bucket only) — ".pptx", "_ppt_" or a "modN_" prefix
    if bucket == "large" and (".pptx" in s or "_ppt_" in s or re.match(r"mod\d+_", s)):
        return "course_ppt"
    # Course modules — two-digit numeric prefix such as "02_intro.docx".
    # Note this rule is not restricted by bucket.
    if re.match(r"^\d{2}_", source):
        return "course_module"
    # Everything else falls into 'other' for medium; not used in small/large targets
    return "other"
def _excluded_sources(pack):
    """Return the set of "source" values found in a loaded audit pack.

    Accepts a dict holding the entries under "pairs" (or, failing that,
    "results") as well as a bare top-level list of entries. Any other shape,
    and any entry that is not a dict with a truthy "source", contributes
    nothing instead of raising.
    """
    if isinstance(pack, dict):
        entries = pack.get("pairs", pack.get("results", []))
    else:
        entries = pack
    if not isinstance(entries, list):
        return set()
    return {
        entry["source"]
        for entry in entries
        if isinstance(entry, dict) and entry.get("source")
    }


def main():
    """Draw the type-aware stratified sample and write the expansion pack.

    Steps: load candidates from VALIDATION_RESULTS, drop sources already in
    EXISTING_AUDIT_PACK, keep documents that have both conditions and a known
    size bucket, label each with classify(), then draw TYPE_TARGETS documents
    per (bucket, type) with a fixed seed. With --dry-run the draw is only
    printed; otherwise it is written to OUTPUT_FILE.

    Exits with status 1 when VALIDATION_RESULTS does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not VALIDATION_RESULTS.exists():
        print(f"ERROR: {VALIDATION_RESULTS} not found", file=sys.stderr)
        sys.exit(1)

    with open(VALIDATION_RESULTS, encoding="utf-8") as f:
        validation = json.load(f)
    all_docs = validation["results"]
    print(f"Loaded {len(all_docs)} documents from validation results")
    print(f"Experiment: {validation.get('title', 'unknown')}")

    # Load existing audit pack to exclude its sources (audit pack uses 'pairs')
    excluded_sources = set()
    if EXISTING_AUDIT_PACK.exists():
        with open(EXISTING_AUDIT_PACK, encoding="utf-8") as f:
            excluded_sources = _excluded_sources(json.load(f))
    print(f"Excluding {len(excluded_sources)} sources already in audit pack")

    # Filter to valid candidates
    valid_docs = []
    for doc in all_docs:
        src = doc.get("source")
        # A document without a usable source name can be neither classified
        # nor de-duplicated against the audit pack, so it is not a candidate.
        if not isinstance(src, str) or src in excluded_sources:
            continue
        if not doc.get("condition_a") or not doc.get("condition_b"):
            continue
        bucket = doc.get("size_bucket")
        if bucket not in TYPE_TARGETS:
            continue
        # The label is stored on the document itself and therefore also
        # appears in the written output.
        doc["_type"] = classify(src, bucket)
        valid_docs.append(doc)
    print(f"Valid candidate documents: {len(valid_docs)}")

    # Group once by bucket and type; list order follows valid_docs so the
    # seeded draw below sees the same populations in the same order.
    candidates = {bucket: {} for bucket in TYPE_TARGETS}
    for doc in valid_docs:
        candidates[doc["size_bucket"]].setdefault(doc["_type"], []).append(doc)

    # Print what's available per (bucket, type) before drawing
    print("\nCandidates by (bucket, type):")
    for bucket in TYPE_TARGETS:
        print(f" {bucket}:")
        for t in sorted(candidates[bucket]):
            target = TYPE_TARGETS[bucket].get(t, "")
            print(f" {t:>16}: {len(candidates[bucket][t])} avail, target {target}")

    # Stratified type-aware draw
    random.seed(SEED)
    drawn = []
    warnings = []
    for bucket, type_targets in TYPE_TARGETS.items():
        for doc_type, target in type_targets.items():
            type_docs = candidates[bucket].get(doc_type, [])
            if len(type_docs) < target:
                msg = (f"WARNING: bucket={bucket} type={doc_type} "
                       f"available={len(type_docs)} target={target}")
                warnings.append(msg)
                print(msg, file=sys.stderr)
            # Short populations are drawn in full rather than failing.
            n_to_draw = min(target, len(type_docs))
            drawn.extend(random.sample(type_docs, n_to_draw))

    # Report draw
    print(f"\nDrew {len(drawn)} documents:")
    for d in drawn:
        src = d.get("source", "<unknown>")
        chars = d.get("doc_chars_original", 0)
        bucket = d.get("size_bucket", "?")
        doc_type = d.get("_type", "?")
        truncated = " (TRUNCATED)" if d.get("truncated") else ""
        print(f" [{bucket:>6}/{doc_type:>16}] {chars:>6}c {src}{truncated}")

    # Bucket-level summary
    bucket_counts = {bucket: 0 for bucket in TYPE_TARGETS}
    for d in drawn:
        bucket_counts[d["size_bucket"]] += 1
    print(f"\nBucket totals: {bucket_counts}")

    if args.dry_run:
        print(f"\n--dry-run set, not writing output file")
        return

    output = {
        "metadata": {
            "generated_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "source_validation_file": str(VALIDATION_RESULTS),
            "seed": SEED,
            "stratification": "type-aware within length bucket",
            "type_targets": TYPE_TARGETS,
            "bucket_counts": bucket_counts,
            "excluded_count": len(excluded_sources),
            "warnings": warnings,
            "purpose": "n=20 audit expansion per audit-expansion-protocol.md (type-aware amendment)",
        },
        "results": drawn,
    }
    with open(OUTPUT_FILE, "w") as f:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(output, f, indent=2, default=str)
    print(f"\nWrote {OUTPUT_FILE}")
    print(f" {len(drawn)} documents ready for rating")
if __name__ == "__main__":
main()
@@ -0,0 +1,605 @@
#!/usr/bin/env python3
"""
Base-Class Enrichment Test — OOP Framing Experiment
Tests whether non-entity metadata from a local model (domain class, structural
signals, presence flags, length, summary) can take load off the API without
constraining what it extracts.
The local model does NOT draft entities. The API still does full extraction.
The local model produces metadata that orients the API's reading.
Conditions:
A — Baseline: single Claude Haiku call, full extraction, no metadata
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
If B produces fewer edges or less predicate diversity, metadata is acting as
constraint and the OOP framing is falsified.
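Sample: audit re-run — every source listed under "pairs" in
~/aaronai/experiments/base_class_audit_pack.json. The 50-doc stratified draw
(15 small / 25 medium / 10 large) from briefing_test_v2_results.json is not
performed by this variant; that file is still loaded, but its entries are not
used for sampling. Each document is bucketed by the length actually sent:
- small (<1000 chars)
- medium (1000-4999 chars)
- large (5000+ chars, capped at 12K)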
Outputs: ~/aaronai/experiments/base_class_audit_rerun_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
# Earlier briefing-test results; main() requires this file to exist and keeps
# its "documents" entries whose status is "SUCCESS".
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Where main() writes the JSON summary of this run.
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_audit_rerun_results.json"
# Model name, output-token cap and temperature passed to
# client.messages.create() in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0
# Endpoint, model name and request timeout used by call_local_metadata().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates the joined document text to this many characters.
MAX_DOC_CHARS = 12000
# Prices for the cost totals: main() multiplies token counts by these and
# divides by 1_000_000, reporting the result as USD.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
Return ONLY valid JSON with this exact schema:
{
"language": "en or other",
"char_length": integer,
"primary_format": "prose, presentation, list, form, code, or mixed",
"structural_signals": {
"has_headings": boolean,
"has_bullet_lists": boolean,
"has_numbered_lists": boolean,
"has_tables": boolean,
"has_code_blocks": boolean,
"has_dates": boolean
},
"content_signals": {
"has_named_people": boolean,
"has_institutional_language": boolean,
"has_technical_terminology": boolean,
"has_first_person": boolean,
"has_quotations": boolean
},
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
}
JSON only, no commentary.
DOCUMENT:
"""
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
DOCUMENT METADATA:
{metadata_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Return ``text`` without surrounding whitespace or a Markdown code fence.

    A leading ``` or ```json marker and a trailing ``` marker are removed
    when present. Falsy input (None, empty string) yields "".
    """
    if not text:
        return ""
    body = text.strip()
    # Opening fence first, then closing fence; each pattern is anchored to
    # its own end of the already-stripped text.
    for fence in (r"^```(?:json)?\s*", r"\s*```$"):
        body = re.sub(fence, "", body)
    return body.strip()
def fetch_document_text(pg_conn, source):
    """Load the stored text for ``source`` and cap it at MAX_DOC_CHARS.

    Selects the ``document`` column of every ``embeddings`` row whose
    ``source`` matches, ordered by ``id``, and joins the values with blank
    lines.

    Args:
        pg_conn: Open database connection providing ``cursor()``.
        source: Value matched against the ``source`` column (passed as a
            bound parameter, never interpolated into the SQL).

    Returns:
        ``(text, full_length)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``full_length`` is the length before
        the cut; ``(None, 0)`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send ``prompt_text`` as a single user message and summarise the reply.

    Uses HAIKU_MODEL, HAIKU_MAX_TOKENS and HAIKU_TEMPERATURE for the request.
    Exceptions raised by the client are not caught here.

    Returns:
        Dict with "input_tokens", "output_tokens", "latency_s" (seconds,
        rounded to 2 places), "response_text" (text of the first content
        block, or "" when the reply has no content) and "stop_reason".
    """
    request = {
        "model": HAIKU_MODEL,
        "max_tokens": HAIKU_MAX_TOKENS,
        "temperature": HAIKU_TEMPERATURE,
        "messages": [{"role": "user", "content": prompt_text}],
    }
    started = time.time()
    reply = client.messages.create(**request)

    tokens_in = reply.usage.input_tokens
    tokens_out = reply.usage.output_tokens
    elapsed = round(time.time() - started, 2)
    if reply.content:
        text = reply.content[0].text
    else:
        text = ""
    return {
        "input_tokens": tokens_in,
        "output_tokens": tokens_out,
        "latency_s": elapsed,
        "response_text": text,
        "stop_reason": reply.stop_reason,
    }
def call_local_metadata(document_text):
    """Ask the local model for surface metadata about ``document_text``.

    Posts LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL and
    never raises: any failure is reported in the returned dict.

    Returns:
        ``{"response": <str>, "latency_s": <float>}`` on success, or
        ``{"error": <message>, "latency_s": <float>}`` on any exception.
    """
    started = time.time()
    try:
        # Built inside the try block so that a bad document_text is reported
        # as an error result like every other failure.
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": round(time.time() - started, 2)}
    except Exception as exc:
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply is accepted only when, after fence stripping, it is a JSON
    object whose "entities" and "edges" values are both lists. Every other
    case returns ``(None, None, False)``.
    """
    failure = (None, None, False)

    cleaned = strip_json_fences(raw)
    if not cleaned:
        return failure
    try:
        payload = json.loads(cleaned)
    except json.JSONDecodeError:
        return failure
    if not isinstance(payload, dict):
        return failure

    entity_list = payload.get("entities")
    edge_list = payload.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
def parse_metadata(raw):
    """Decode the local model's reply as JSON.

    Returns the decoded value (whatever JSON type it is), or None when the
    reply is empty after fence stripping or is not valid JSON.
    """
    cleaned = strip_json_fences(raw)
    if cleaned:
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass
    return None
def graph_metrics(entities, edges):
    """Summarise an extracted graph as size, diversity and connectivity numbers.

    Args:
        entities: List of entity items (dicts with "name"/"type"), or None.
        edges: List of edge items (dicts with "subject"/"predicate"/"object"),
            or None.

    Returns:
        None when either argument is None; otherwise a dict with
        "n_entities", "n_edges", "predicate_diversity", "type_diversity",
        "avg_degree", "largest_component" and "largest_component_pct".
        Counts use the raw list lengths; diversity and connectivity compare
        values after ``str(...).strip().lower()`` and ignore non-dict items.
    """
    if entities is None or edges is None:
        return None

    def distinct(items, field):
        # Normalised, truthy values of `field` over the dict items only.
        found = set()
        for item in items:
            if not isinstance(item, dict):
                continue
            value = item.get(field)
            if value:
                found.add(str(value).strip().lower())
        return found

    entity_total = len(entities)
    edge_total = len(edges)
    predicate_set = distinct(edges, "predicate")
    type_set = distinct(entities, "type")

    # Each edge touches two nodes, hence the factor 2.
    mean_degree = 0.0
    if entity_total > 0:
        mean_degree = 2 * edge_total / entity_total

    # Undirected adjacency over known entity names; edges that mention a
    # name missing from the entity list are ignored for connectivity.
    neighbours = {name: set() for name in distinct(entities, "name")}
    for edge in edges:
        if isinstance(edge, dict):
            src = str(edge.get("subject", "")).strip().lower()
            dst = str(edge.get("object", "")).strip().lower()
            if src in neighbours and dst in neighbours:
                neighbours[src].add(dst)
                neighbours[dst].add(src)

    # Stack-based traversal of each unseen node to size its component.
    seen = set()
    biggest = 0
    for root in neighbours:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for nxt in neighbours[current]:
                if nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)
        biggest = max(biggest, size)

    if entity_total:
        biggest_pct = round(100 * biggest / entity_total, 1)
    else:
        biggest_pct = 0.0

    return {
        "n_entities": entity_total,
        "n_edges": edge_total,
        "predicate_diversity": len(predicate_set),
        "type_diversity": len(type_set),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
def stratify(docs):
    """Audit re-run: build the sample from base_class_audit_pack.json.

    Args:
        docs: Unused. Kept so callers written for the stratified-draw
            variant of this function keep working.

    Returns:
        One ``{"source", "content_length": 0, "status": "SUCCESS"}`` entry
        per item under "pairs" in the audit pack, in file order, or ``[]``
        (after reporting on stderr) when the audit pack does not exist.
    """
    audit_file = Path.home() / "aaronai" / "experiments" / "base_class_audit_pack.json"
    if not audit_file.exists():
        # Errors go to stderr, like the other failure messages in this script.
        print(f"ERROR: {audit_file} not found", file=sys.stderr)
        return []
    audit = json.loads(audit_file.read_text(encoding="utf-8"))
    # Synthesize doc_meta entries for the audit sources; content_length is a
    # placeholder since the real length is measured after fetching the text.
    sample = [
        {"source": pair["source"], "content_length": 0, "status": "SUCCESS"}
        for pair in audit["pairs"]
    ]
    print(f"Audit re-run: {len(sample)} docs from base_class_audit_pack.json")
    return sample
def fmt_metrics(m):
    """Render a graph_metrics() dict as a one-line summary, or "n/a" for None."""
    if m is not None:
        sizes = f"e={m['n_entities']} edge={m['n_edges']} "
        diversity = f"pred={m['predicate_diversity']} type={m['type_diversity']} "
        shape = f"deg={m['avg_degree']} comp={m['largest_component']}/{m['n_entities']}"
        return sizes + diversity + shape
    return "n/a"
def _condition_a_record(a, a_metrics):
    """Build the "condition_a" entry of a per-document result.

    ``a`` is either the dict returned by call_haiku() or ``{"error": ...}``
    when the call failed, so every field is read with ``.get()``.
    """
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def _avg_metric(rows, condition, key):
    """Mean of one metric over ``rows`` for one condition, rounded to 2 places (None if no values)."""
    vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
    return round(statistics.mean(vals), 2) if vals else None


def _fmt_pct(value):
    """Format a percentage delta with sign and one decimal, or "n/a" when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def _bucket_summary(rows):
    """Aggregate token deltas and average graph metrics for the valid pairs of one size bucket."""
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    return {
        "n": len(rows),
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
        "a_avg_entities": _avg_metric(rows, "condition_a", "n_entities"),
        "b_avg_entities": _avg_metric(rows, "condition_b", "n_entities"),
        "a_avg_edges": _avg_metric(rows, "condition_a", "n_edges"),
        "b_avg_edges": _avg_metric(rows, "condition_b", "n_edges"),
        "a_avg_predicate_diversity": _avg_metric(rows, "condition_a", "predicate_diversity"),
        "b_avg_predicate_diversity": _avg_metric(rows, "condition_b", "predicate_diversity"),
        "a_avg_type_diversity": _avg_metric(rows, "condition_a", "type_diversity"),
        "b_avg_type_diversity": _avg_metric(rows, "condition_b", "type_diversity"),
        "a_avg_degree": _avg_metric(rows, "condition_a", "avg_degree"),
        "b_avg_degree": _avg_metric(rows, "condition_b", "avg_degree"),
        "a_avg_largest_component_pct": _avg_metric(rows, "condition_a", "largest_component_pct"),
        "b_avg_largest_component_pct": _avg_metric(rows, "condition_b", "largest_component_pct"),
    }


def _process_document(client, pg_conn, source, index, total):
    """Run condition A and condition B for one document and return its result record.

    The record is one of four shapes: a skip marker when the document text
    cannot be fetched, a record whose condition_b is marked
    "local_model_failed" or "metadata_parse_failed", or a full A/B record.
    """
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{index:02d}/{total}] {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    trunc_marker = "*" if truncated else " "
    print(f"[{index:02d}/{total}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A — a failure is recorded and the document still goes
    # through condition B.
    try:
        a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
        a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
        a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
        print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
              f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
        print(f" {fmt_metrics(a_metrics)}", flush=True)
    except Exception as e:
        print(f" A FAILED: {e}", flush=True)
        a = {"error": str(e)}
        a_metrics = None

    def record(condition_b):
        # Shared envelope for every non-skipped outcome of this document.
        return {
            "source": source,
            "size_bucket": size_bucket,
            "doc_chars_original": original_len,
            "doc_chars_sent": sent_len,
            "truncated": truncated,
            "condition_a": _condition_a_record(a, a_metrics),
            "condition_b": condition_b,
        }

    # Condition B local metadata pass
    local_result = call_local_metadata(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']}", flush=True)
        return record({
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        })

    local_raw = local_result["response"]
    metadata = parse_metadata(local_raw)
    # Override LLM-hallucinated char_length with Python-computed truth
    if metadata is not None and isinstance(metadata, dict):
        metadata["char_length"] = len(doc_text)
    print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
          flush=True)
    if metadata is None:
        print(f" B: metadata parse failed — skipping API call", flush=True)
        return record({
            "skipped": "metadata_parse_failed",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        })

    metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
    # str.replace rather than str.format: the prompt contains literal JSON braces.
    b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text
    try:
        b = call_haiku(client, b_prompt)
        b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
        b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
        print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
              f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
        print(f" {fmt_metrics(b_metrics)}", flush=True)
    except Exception as e:
        print(f" B api FAILED: {e}", flush=True)
        b = {"error": str(e)}
        b_metrics = None

    # Per-doc deltas (only when both API calls returned usage numbers)
    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        pred_pct_str = "n/a"
        if a_metrics and b_metrics:
            if a_metrics["n_edges"] > 0:
                edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
            if a_metrics["predicate_diversity"] > 0:
                pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
        print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
              flush=True)

    return record({
        "local_latency_s": local_result.get("latency_s"),
        "local_metadata": metadata,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "metrics": b_metrics,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:32000],
        "error": b.get("error"),
    })


def main():
    """Run the A/B extraction experiment over the sample and write the summary.

    Requires ANTHROPIC_API_KEY and PG_DSN in the environment and V2_FILE on
    disk (exits with status 1 otherwise). Every sampled document is processed
    by _process_document(); totals, per-bucket aggregates and all per-document
    records are then written to OUTPUT_FILE and a report is printed.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)

    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print(f"Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            results.append(
                _process_document(client, pg_conn, doc_meta["source"], i, len(sample))
            )
    finally:
        # Close the connection even when a document raises unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents with parsed graphs under BOTH conditions count as pairs.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        by_bucket[bucket] = _bucket_summary(rows) if rows else None

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # The deltas are None when a bucket's condition-A token total is 0.
            print(f" cost: in {_fmt_pct(stats['input_delta_pct'])} out {_fmt_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")
if __name__ == "__main__":
main()
+593
View File
@@ -0,0 +1,593 @@
#!/usr/bin/env python3
"""
Base-Class Enrichment Test — OOP Framing Experiment
Tests whether non-entity metadata from a local model (domain class, structural
signals, presence flags, length, summary) can take load off the API without
constraining what it extracts.
The local model does NOT draft entities. The API still does full extraction.
The local model produces metadata that orients the API's reading.
Conditions:
A — Baseline: single Claude Haiku call, full extraction, no metadata
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
If B produces fewer edges or less predicate diversity, metadata is acting as
constraint and the OOP framing is falsified.
Sample: 20 docs from briefing_test_v2_results.json:
- 5 small (<1000 chars)
- 10 medium (1000-5000 chars)
- 5 large (5000-12000 chars, capped at 12K)
Outputs: ~/aaronai/experiments/base_class_test_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
# Input and output locations. NOTE(review): the code that reads V2_FILE and
# writes OUTPUT_FILE is outside the visible part of this file — confirm usage.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_test_results.json"
# Model name, output-token cap and temperature passed to
# client.messages.create() in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
# Endpoint, model name and request timeout used by call_local_metadata().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates the joined document text to this many characters.
MAX_DOC_CHARS = 12000
# Presumably prices per million input/output tokens for cost reporting —
# TODO confirm against the code that uses them.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
Return ONLY valid JSON with this exact schema:
{
"language": "en or other",
"char_length": integer,
"primary_format": "prose, presentation, list, form, code, or mixed",
"structural_signals": {
"has_headings": boolean,
"has_bullet_lists": boolean,
"has_numbered_lists": boolean,
"has_tables": boolean,
"has_code_blocks": boolean,
"has_dates": boolean
},
"content_signals": {
"has_named_people": boolean,
"has_institutional_language": boolean,
"has_technical_terminology": boolean,
"has_first_person": boolean,
"has_quotations": boolean
},
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
}
JSON only, no commentary.
DOCUMENT:
"""
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
DOCUMENT METADATA:
{metadata_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Return ``text`` without surrounding whitespace or a Markdown code fence.

    A leading ``` or ```json marker and a trailing ``` marker are removed
    when present. Falsy input (None, empty string) yields "".
    """
    if not text:
        return ""
    body = text.strip()
    # Opening fence first, then closing fence; each pattern is anchored to
    # its own end of the already-stripped text.
    for fence in (r"^```(?:json)?\s*", r"\s*```$"):
        body = re.sub(fence, "", body)
    return body.strip()
def fetch_document_text(pg_conn, source):
    """Load the stored text for ``source`` and cap it at MAX_DOC_CHARS.

    Selects the ``document`` column of every ``embeddings`` row whose
    ``source`` matches, ordered by ``id``, and joins the values with blank
    lines.

    Args:
        pg_conn: Open database connection providing ``cursor()``.
        source: Value matched against the ``source`` column (passed as a
            bound parameter, never interpolated into the SQL).

    Returns:
        ``(text, full_length)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``full_length`` is the length before
        the cut; ``(None, 0)`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send ``prompt_text`` as a single user message and summarise the reply.

    Uses HAIKU_MODEL, HAIKU_MAX_TOKENS and HAIKU_TEMPERATURE for the request.
    Exceptions raised by the client are not caught here.

    Returns:
        Dict with "input_tokens", "output_tokens", "latency_s" (seconds,
        rounded to 2 places), "response_text" (text of the first content
        block, or "" when the reply has no content) and "stop_reason".
    """
    request = {
        "model": HAIKU_MODEL,
        "max_tokens": HAIKU_MAX_TOKENS,
        "temperature": HAIKU_TEMPERATURE,
        "messages": [{"role": "user", "content": prompt_text}],
    }
    started = time.time()
    reply = client.messages.create(**request)

    tokens_in = reply.usage.input_tokens
    tokens_out = reply.usage.output_tokens
    elapsed = round(time.time() - started, 2)
    if reply.content:
        text = reply.content[0].text
    else:
        text = ""
    return {
        "input_tokens": tokens_in,
        "output_tokens": tokens_out,
        "latency_s": elapsed,
        "response_text": text,
        "stop_reason": reply.stop_reason,
    }
def call_local_metadata(document_text):
    """Ask the local model for surface metadata about *document_text*.

    POSTs LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL,
    non-streaming, with JSON output requested.

    Returns ``{"response": <str>, "latency_s": <float>}`` on success, or
    ``{"error": <str>, "latency_s": <float>}`` if anything raises; this
    function never propagates an Exception to the caller.
    """
    started = time.time()

    def _elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": _elapsed()}
    except Exception as exc:
        # Best-effort by design: the caller records the failure and moves on.
        return {"error": str(exc), "latency_s": _elapsed()}
def parse_graph_full(raw):
    """Decode a model reply into ``(entities, edges, parsed_ok)``.

    Succeeds only when the reply (after fence stripping) is a JSON object whose
    "entities" and "edges" values are both lists; those lists are returned
    as-is with ``True``. Every other case returns ``(None, None, False)``.
    """
    failure = (None, None, False)
    payload = strip_json_fences(raw)
    if not payload:
        return failure
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        return failure
    if not isinstance(decoded, dict):
        return failure
    entity_list = decoded.get("entities")
    edge_list = decoded.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
def parse_metadata(raw):
    """Decode the local model's metadata reply into a dict.

    Returns the decoded JSON object, or None when the reply is empty, is not
    valid JSON, or decodes to something other than an object. The metadata
    schema requested from the model is an object, and main() treats None as
    "metadata parse failed", so a stray list or scalar must not pass through.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    Args:
        entities: list of entity items (dicts with "name"/"type"), or None.
        edges: list of edge items (dicts with "subject"/"predicate"/"object"),
            or None.

    Returns:
        None when either argument is None. Otherwise a dict with:
        ``n_entities`` / ``n_edges`` (raw list lengths), ``predicate_diversity``
        and ``type_diversity`` (distinct case-insensitive, trimmed labels),
        ``avg_degree`` (2 * edges / entities, rounded to 2 places),
        ``largest_component`` (node count of the biggest connected component
        over named entities) and ``largest_component_pct`` (that count as a
        percentage of ``n_entities``, rounded to 1 place).
    """
    if entities is None or edges is None:
        return None

    def _norm(value):
        return str(value).strip().lower()

    entity_count = len(entities)
    edge_count = len(edges)
    # Non-dict items still count toward the totals above but carry no labels.
    entity_dicts = [item for item in entities if isinstance(item, dict)]
    edge_dicts = [item for item in edges if isinstance(item, dict)]

    predicate_labels = {
        _norm(edge["predicate"]) for edge in edge_dicts if edge.get("predicate")
    }
    type_labels = {
        _norm(ent["type"]) for ent in entity_dicts if ent.get("type")
    }
    node_names = {
        _norm(ent["name"]) for ent in entity_dicts if ent.get("name")
    }

    # Undirected adjacency; edges touching an undeclared entity are ignored.
    neighbors = {name: set() for name in node_names}
    for edge in edge_dicts:
        src = _norm(edge.get("subject", ""))
        dst = _norm(edge.get("object", ""))
        if src in neighbors and dst in neighbors:
            neighbors[src].add(dst)
            neighbors[dst].add(src)

    # Stack-based traversal of each unseen node to size its component.
    seen = set()
    biggest = 0
    for root in neighbors:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for nxt in neighbors[current] - seen:
                seen.add(nxt)
                frontier.append(nxt)
        biggest = max(biggest, size)

    if entity_count > 0:
        mean_degree = 2 * edge_count / entity_count
        biggest_pct = round(100 * biggest / entity_count, 1)
    else:
        mean_degree = 0.0
        biggest_pct = 0.0
    return {
        "n_entities": entity_count,
        "n_edges": edge_count,
        "predicate_diversity": len(predicate_labels),
        "type_diversity": len(type_labels),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
def stratify(docs):
    """Pick a length-stratified sample, keeping input order within each bucket.

    Buckets by ``content_length``: small (<1000), medium (1000-4999) and large
    (>=5000). Returns the first 5 small, then the first 10 medium, then the
    first 5 large documents.
    """
    buckets = {"small": [], "medium": [], "large": []}
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            buckets["small"].append(doc)
        elif length < 5000:
            buckets["medium"].append(doc)
        else:
            buckets["large"].append(doc)
    picked = []
    for label, cap in (("small", 5), ("medium", 10), ("large", 5)):
        picked.extend(buckets[label][:cap])
    return picked
def fmt_metrics(m):
    """Render a graph_metrics dict as a one-line summary, or "n/a" for None."""
    if m is None:
        return "n/a"
    fields = (
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    )
    return " ".join(fields)
def _condition_a_record(a, a_metrics):
    """Return the Condition A fields stored for one document.

    ``a`` is either a call_haiku result or ``{"error": ...}``, so every field
    is read with ``.get``; the stored response text is capped at 4000 chars.
    """
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _run_graph_condition(client, prompt_text, label):
    """Run one Haiku extraction, print its stats, and return (result, metrics).

    On any failure the error is printed and ``({"error": <str>}, None)`` is
    returned so a single bad document cannot abort the whole run.
    """
    try:
        result = call_haiku(client, prompt_text)
        ents, edges, ok = parse_graph_full(result["response_text"])
        metrics = graph_metrics(ents, edges) if ok else None
        print(f" {label}: in={result['input_tokens']} out={result['output_tokens']} "
              f"stop={result['stop_reason']} t={result['latency_s']}s", flush=True)
        print(f" {fmt_metrics(metrics)}", flush=True)
    except Exception as e:
        print(f" {label} FAILED: {e}", flush=True)
        return {"error": str(e)}, None
    return result, metrics


def _process_document(client, pg_conn, doc_meta, index, total):
    """Run conditions A and B on one sampled document; return its result record."""
    source = doc_meta["source"]
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{index:02d}/{total}] {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}
    sent_len = len(doc_text)
    truncated = original_len > sent_len
    # The bucket is derived from the text actually sent, not the stored length.
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    trunc_marker = "*" if truncated else " "
    print(f"[{index:02d}/{total}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A: plain extraction, no metadata.
    a, a_metrics = _run_graph_condition(client, CONDITION_A_PROMPT + doc_text, "A")
    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "condition_a": _condition_a_record(a, a_metrics),
    }

    # Condition B, step 1: local metadata pass.
    local_result = call_local_metadata(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']}", flush=True)
        record["condition_b"] = {
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        }
        return record
    local_raw = local_result["response"]
    metadata = parse_metadata(local_raw)
    print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
          flush=True)
    if metadata is None:
        print(" B: metadata parse failed — skipping API call", flush=True)
        record["condition_b"] = {
            "skipped": "metadata_parse_failed",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        }
        return record

    # Condition B, step 2: extraction with the metadata spliced into the prompt.
    metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
    b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text
    b, b_metrics = _run_graph_condition(client, b_prompt, "B api")

    # Per-doc deltas are printed only when both API calls returned token counts.
    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        pred_pct_str = "n/a"
        if a_metrics and b_metrics:
            if a_metrics["n_edges"] > 0:
                edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
            if a_metrics["predicate_diversity"] > 0:
                pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
        print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
              flush=True)

    record["condition_b"] = {
        "local_latency_s": local_result.get("latency_s"),
        "local_metadata": metadata,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "metrics": b_metrics,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:4000],
        "error": b.get("error"),
    }
    return record


def _avg_metric(rows, condition, key):
    """Mean of one metric over rows that have metrics, rounded to 2 places (None if no rows)."""
    vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
    return round(statistics.mean(vals), 2) if vals else None


def _bucket_stats(rows):
    """Aggregate token deltas and average graph metrics for one size bucket."""
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    stats = {
        "n": len(rows),
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
    }
    # (output-key suffix, metric key); each is emitted for A and then for B.
    metric_keys = (
        ("entities", "n_entities"),
        ("edges", "n_edges"),
        ("predicate_diversity", "predicate_diversity"),
        ("type_diversity", "type_diversity"),
        ("degree", "avg_degree"),
        ("largest_component_pct", "largest_component_pct"),
    )
    for suffix, metric_key in metric_keys:
        stats[f"a_avg_{suffix}"] = _avg_metric(rows, "condition_a", metric_key)
        stats[f"b_avg_{suffix}"] = _avg_metric(rows, "condition_b", metric_key)
    return stats


def _fmt_pct(value):
    """Format a percentage delta such as '+3.2%', or 'n/a' when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the A/B extraction experiment over the sample and write the summary.

    Exits with status 1 when ANTHROPIC_API_KEY or PG_DSN is unset or V2_FILE
    is missing. Otherwise processes each sampled document, aggregates totals
    and per-bucket statistics over documents where both conditions produced
    metrics, writes the summary JSON to OUTPUT_FILE and prints a report.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (5s/10m/5l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()
    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            results.append(_process_document(client, pg_conn, doc_meta, i, len(sample)))
    finally:
        # Close the connection even if a document raises unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions yielded metrics enter the totals.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        by_bucket[bucket] = _bucket_stats(rows) if rows else None

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # Deltas are None when the baseline token sum is zero.
            print(f" cost: in {_fmt_pct(stats['input_delta_pct'])} out {_fmt_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,611 @@
#!/usr/bin/env python3
"""
Base-Class Enrichment Test — OOP Framing Experiment
Tests whether non-entity metadata from a local model (domain class, structural
signals, presence flags, length, summary) can take load off the API without
constraining what it extracts.
The local model does NOT draft entities. The API still does full extraction.
The local model produces metadata that orients the API's reading.
Conditions:
A — Baseline: single Claude Haiku call, full extraction, no metadata
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
If B produces fewer edges or less predicate diversity, metadata is acting as
constraint and the OOP framing is falsified.
Sample: up to 50 docs:
- 15 small (<1000 chars), from briefing_test_v2_results.json
- 25 medium (1000-5000 chars), from briefing_test_v2_results.json
- 10 large (5000+ chars, text capped at 12K), listed in ~/aaronai/large_bucket_sources.json
Outputs: ~/aaronai/experiments/base_class_validation_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# Load variables from the project .env file into the process environment
# (main() reads ANTHROPIC_API_KEY and PG_DSN from os.environ).
load_dotenv(Path.home() / "aaronai" / ".env")
# Input: earlier experiment results; main() samples from its "documents" list.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output: summary JSON written at the end of main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_validation_results.json"
# Model settings passed to client.messages.create() in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0
# Local model endpoint, model name and request timeout used by call_local_metadata().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180
# fetch_document_text() cuts each document to this many characters.
MAX_DOC_CHARS = 12000
# Price per million input / output tokens; main() reports the totals as USD.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) extraction prompt; main() appends the document text
# directly after the trailing "DOCUMENT:" line.
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local metadata pass; call_local_metadata() appends the
# document text after "DOCUMENT:". The model-reported "char_length" is not
# trusted: main() overwrites it with len(doc_text) before the metadata is used.
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
Return ONLY valid JSON with this exact schema:
{
"language": "en or other",
"char_length": integer,
"primary_format": "prose, presentation, list, form, code, or mixed",
"structural_signals": {
"has_headings": boolean,
"has_bullet_lists": boolean,
"has_numbered_lists": boolean,
"has_tables": boolean,
"has_code_blocks": boolean,
"has_dates": boolean
},
"content_signals": {
"has_named_people": boolean,
"has_institutional_language": boolean,
"has_technical_terminology": boolean,
"has_first_person": boolean,
"has_quotations": boolean
},
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
}
JSON only, no commentary.
DOCUMENT:
"""
# Condition B extraction prompt. main() fills the "{metadata_json}" placeholder
# with str.replace (not str.format, so the literal JSON braces in the schema
# below need no escaping) and appends the document text after "DOCUMENT:".
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
DOCUMENT METADATA:
{metadata_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Return *text* without a surrounding Markdown code fence.

    Falsy input (None or "") yields "". Otherwise the text is trimmed, an
    opening ``` or ```json fence at the very start and a closing ``` fence at
    the very end are removed, and the result is trimmed again.
    """
    if not text:
        return ""
    body = text.strip()
    without_opening = re.sub(r"^```(?:json)?\s*", "", body)
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
def fetch_document_text(pg_conn, source):
    """Fetch and reassemble the stored text for one source document.

    Reads every ``document`` value in the ``embeddings`` table whose ``source``
    matches, ordered by ``id``, and joins them with blank lines.

    Args:
        pg_conn: open database connection exposing ``cursor()``.
        source: value matched against the ``source`` column.

    Returns:
        ``(text, original_len)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``original_len`` is its length before the
        cut; ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    if not rows:
        return None, 0
    # Skip NULL chunks: str.join would raise TypeError on None and abort the run.
    full = "\n\n".join(r[0] for r in rows if r[0] is not None)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and summarize the reply.

    Returns a dict with ``input_tokens``, ``output_tokens``, ``latency_s``
    (wall-clock seconds, rounded to 2 places), ``response_text`` (text of the
    first content block, or "" when the reply has no content) and
    ``stop_reason``. Exceptions from the client propagate to the caller.
    """
    started = time.time()
    response = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = response.usage
    blocks = response.content
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
    }
    summary["latency_s"] = round(time.time() - started, 2)
    summary["response_text"] = blocks[0].text if blocks else ""
    summary["stop_reason"] = response.stop_reason
    return summary
def call_local_metadata(document_text):
    """Ask the local model for surface metadata about *document_text*.

    POSTs LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL,
    non-streaming, with JSON output requested.

    Returns ``{"response": <str>, "latency_s": <float>}`` on success, or
    ``{"error": <str>, "latency_s": <float>}`` if anything raises; this
    function never propagates an Exception to the caller.
    """
    started = time.time()

    def _elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": _elapsed()}
    except Exception as exc:
        # Best-effort by design: the caller records the failure and moves on.
        return {"error": str(exc), "latency_s": _elapsed()}
def parse_graph_full(raw):
    """Decode a model reply into ``(entities, edges, parsed_ok)``.

    Succeeds only when the reply (after fence stripping) is a JSON object whose
    "entities" and "edges" values are both lists; those lists are returned
    as-is with ``True``. Every other case returns ``(None, None, False)``.
    """
    failure = (None, None, False)
    payload = strip_json_fences(raw)
    if not payload:
        return failure
    try:
        decoded = json.loads(payload)
    except json.JSONDecodeError:
        return failure
    if not isinstance(decoded, dict):
        return failure
    entity_list = decoded.get("entities")
    edge_list = decoded.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
def parse_metadata(raw):
    """Decode the local model's metadata reply into a dict.

    Returns the decoded JSON object, or None when the reply is empty, is not
    valid JSON, or decodes to something other than an object. The metadata
    schema requested from the model is an object, and main() treats None as
    "metadata parse failed", so a stray list or scalar must not pass through.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    Args:
        entities: list of entity items (dicts with "name"/"type"), or None.
        edges: list of edge items (dicts with "subject"/"predicate"/"object"),
            or None.

    Returns:
        None when either argument is None. Otherwise a dict with:
        ``n_entities`` / ``n_edges`` (raw list lengths), ``predicate_diversity``
        and ``type_diversity`` (distinct case-insensitive, trimmed labels),
        ``avg_degree`` (2 * edges / entities, rounded to 2 places),
        ``largest_component`` (node count of the biggest connected component
        over named entities) and ``largest_component_pct`` (that count as a
        percentage of ``n_entities``, rounded to 1 place).
    """
    if entities is None or edges is None:
        return None

    def _norm(value):
        return str(value).strip().lower()

    entity_count = len(entities)
    edge_count = len(edges)
    # Non-dict items still count toward the totals above but carry no labels.
    entity_dicts = [item for item in entities if isinstance(item, dict)]
    edge_dicts = [item for item in edges if isinstance(item, dict)]

    predicate_labels = {
        _norm(edge["predicate"]) for edge in edge_dicts if edge.get("predicate")
    }
    type_labels = {
        _norm(ent["type"]) for ent in entity_dicts if ent.get("type")
    }
    node_names = {
        _norm(ent["name"]) for ent in entity_dicts if ent.get("name")
    }

    # Undirected adjacency; edges touching an undeclared entity are ignored.
    neighbors = {name: set() for name in node_names}
    for edge in edge_dicts:
        src = _norm(edge.get("subject", ""))
        dst = _norm(edge.get("object", ""))
        if src in neighbors and dst in neighbors:
            neighbors[src].add(dst)
            neighbors[dst].add(src)

    # Stack-based traversal of each unseen node to size its component.
    seen = set()
    biggest = 0
    for root in neighbors:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for nxt in neighbors[current] - seen:
                seen.add(nxt)
                frontier.append(nxt)
        biggest = max(biggest, size)

    if entity_count > 0:
        mean_degree = 2 * edge_count / entity_count
        biggest_pct = round(100 * biggest / entity_count, 1)
    else:
        mean_degree = 0.0
        biggest_pct = 0.0
    return {
        "n_entities": entity_count,
        "n_edges": edge_count,
        "predicate_diversity": len(predicate_labels),
        "type_diversity": len(type_labels),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
def stratify(docs):
    """Build the experiment sample: small and medium docs from *docs*, large from a file.

    Takes the first 15 documents with ``content_length`` < 1000 and the first
    25 with 1000 <= ``content_length`` < 5000, in input order. Large documents
    are not taken from *docs*: their source names are read from
    ~/aaronai/large_bucket_sources.json and wrapped in synthetic entries with
    ``content_length`` 0 (main() re-derives the size from the fetched text).
    If that file is missing, a warning is printed and no large docs are added.

    Returns:
        list of document-metadata dicts: small, then medium, then large.
    """
    small = [d for d in docs if d["content_length"] < 1000][:15]
    medium = [d for d in docs if 1000 <= d["content_length"] < 5000][:25]
    large_sources_file = Path.home() / "aaronai" / "large_bucket_sources.json"
    if large_sources_file.exists():
        large_source_names = json.loads(large_sources_file.read_text(encoding="utf-8"))
        large = [{"source": s, "content_length": 0, "status": "SUCCESS"}
                 for s in large_source_names]
        # Report what was actually selected rather than the nominal 15/25/10.
        print(f"Stratify: {len(small)} small + {len(medium)} medium from v2, "
              f"{len(large)} large from large_bucket_sources.json")
    else:
        large = []
        print("WARN: large_bucket_sources.json not found, no large docs in sample")
    return small + medium + large
def fmt_metrics(m):
    """Render a graph_metrics dict as a one-line summary, or "n/a" for None."""
    if m is None:
        return "n/a"
    fields = (
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    )
    return " ".join(fields)
def _condition_a_record(a, a_metrics):
    """Return the Condition A fields stored for one document.

    ``a`` is either a call_haiku result or ``{"error": ...}``, so every field
    is read with ``.get``; the stored response text is capped at 32000 chars.
    """
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def _run_graph_condition(client, prompt_text, label):
    """Run one Haiku extraction, print its stats, and return (result, metrics).

    On any failure the error is printed and ``({"error": <str>}, None)`` is
    returned so a single bad document cannot abort the whole run.
    """
    try:
        result = call_haiku(client, prompt_text)
        ents, edges, ok = parse_graph_full(result["response_text"])
        metrics = graph_metrics(ents, edges) if ok else None
        print(f" {label}: in={result['input_tokens']} out={result['output_tokens']} "
              f"stop={result['stop_reason']} t={result['latency_s']}s", flush=True)
        print(f" {fmt_metrics(metrics)}", flush=True)
    except Exception as e:
        print(f" {label} FAILED: {e}", flush=True)
        return {"error": str(e)}, None
    return result, metrics


def _process_document(client, pg_conn, doc_meta, index, total):
    """Run conditions A and B on one sampled document; return its result record."""
    source = doc_meta["source"]
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{index:02d}/{total}] {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}
    sent_len = len(doc_text)
    truncated = original_len > sent_len
    # The bucket is derived from the text actually sent, not the stored length.
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    trunc_marker = "*" if truncated else " "
    print(f"[{index:02d}/{total}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A: plain extraction, no metadata.
    a, a_metrics = _run_graph_condition(client, CONDITION_A_PROMPT + doc_text, "A")
    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "condition_a": _condition_a_record(a, a_metrics),
    }

    # Condition B, step 1: local metadata pass.
    local_result = call_local_metadata(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']}", flush=True)
        record["condition_b"] = {
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        }
        return record
    local_raw = local_result["response"]
    metadata = parse_metadata(local_raw)
    # Override LLM-hallucinated char_length with Python-computed truth
    if isinstance(metadata, dict):
        metadata["char_length"] = len(doc_text)
    print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
          flush=True)
    if metadata is None:
        print(" B: metadata parse failed — skipping API call", flush=True)
        record["condition_b"] = {
            "skipped": "metadata_parse_failed",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        }
        return record

    # Condition B, step 2: extraction with the metadata spliced into the prompt.
    metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
    b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text
    b, b_metrics = _run_graph_condition(client, b_prompt, "B api")

    # Per-doc deltas are printed only when both API calls returned token counts.
    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        pred_pct_str = "n/a"
        if a_metrics and b_metrics:
            if a_metrics["n_edges"] > 0:
                edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
            if a_metrics["predicate_diversity"] > 0:
                pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
        print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
              flush=True)

    record["condition_b"] = {
        "local_latency_s": local_result.get("latency_s"),
        "local_metadata": metadata,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "metrics": b_metrics,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:32000],
        "error": b.get("error"),
    }
    return record


def _avg_metric(rows, condition, key):
    """Mean of one metric over rows that have metrics, rounded to 2 places (None if no rows)."""
    vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
    return round(statistics.mean(vals), 2) if vals else None


def _bucket_stats(rows):
    """Aggregate token deltas and average graph metrics for one size bucket."""
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    stats = {
        "n": len(rows),
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
    }
    # (output-key suffix, metric key); each is emitted for A and then for B.
    metric_keys = (
        ("entities", "n_entities"),
        ("edges", "n_edges"),
        ("predicate_diversity", "predicate_diversity"),
        ("type_diversity", "type_diversity"),
        ("degree", "avg_degree"),
        ("largest_component_pct", "largest_component_pct"),
    )
    for suffix, metric_key in metric_keys:
        stats[f"a_avg_{suffix}"] = _avg_metric(rows, "condition_a", metric_key)
        stats[f"b_avg_{suffix}"] = _avg_metric(rows, "condition_b", metric_key)
    return stats


def _fmt_pct(value):
    """Format a percentage delta such as '+3.2%', or 'n/a' when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the A/B extraction experiment over the sample and write the summary.

    Exits with status 1 when ANTHROPIC_API_KEY or PG_DSN is unset or V2_FILE
    is missing. Otherwise processes each sampled document, aggregates totals
    and per-bucket statistics over documents where both conditions produced
    metrics, writes the summary JSON to OUTPUT_FILE and prints a report.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)
    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()
    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    try:
        for i, doc_meta in enumerate(sample, 1):
            results.append(_process_document(client, pg_conn, doc_meta, i, len(sample)))
    finally:
        # Close the connection even if a document raises unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions yielded metrics enter the totals.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        by_bucket[bucket] = _bucket_stats(rows) if rows else None

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # Deltas are None when the baseline token sum is zero.
            print(f" cost: in {_fmt_pct(stats['input_delta_pct'])} out {_fmt_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,376 @@
#!/usr/bin/env python3
"""
BirdAI Briefing Generator v2 — Experiment 002b
===============================================
Changes from v1 (based on Experiment 004 human evaluation):
- document_type now pre-classified by rule, not by model
- Capture template header stripped before model sees content
- noise_signals constrained to controlled vocabulary
- Model prompt simplified — focuses only on reliable signal fields
- Expanded document type vocabulary for BirdAI-specific types
Results written to ~/aaronai/briefing_test_v2_results.json
"""
import json
import os
import re
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# Postgres connection string, taken from the environment after the .env file
# has been loaded; get_sample_documents() refuses to run when it is unset.
PG_DSN = os.getenv("PG_DSN")
# JSON file that main() writes the per-document results and summary to.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_v2_results.json")
# Model name sent in the request body to the Ollama endpoint below.
MODEL = "mistral"
# Passed as the SQL LIMIT when sampling documents.
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"
# Accepted document_type values; sanitize_briefing() maps anything else the
# model returns to "unknown".
VALID_DOC_TYPES = {
"voice_capture", "image_capture",
"dream_nrem", "dream_rem", "dream_lucid", "dream_synthesis",
"presentation", "code", "spreadsheet",
"academic_pdf", "technical_doc", "chat_log",
"book_excerpt", "form", "syllabus", "email",
"notes", "purchase_order", "annual_report",
"invoice", "memo", "report", "unknown"
}
# Accepted density values; anything else falls back to "medium".
VALID_DENSITIES = {"high", "medium", "low"}
# Accepted extraction_priority values; anything else falls back to "partial".
VALID_PRIORITIES = {"full", "partial", "skip"}
# Controlled vocabulary for noise_signals; sanitize_briefing() drops any
# term that is not listed here.
VALID_NOISE_SIGNALS = {
"repeated_headers", "page_numbers", "formatting_artifacts",
"boilerplate", "watermarks", "footers", "line_numbers",
"encoding_artifacts", "ocr_errors"
}
# Controlled vocabulary for structure_signals, filtered the same way.
VALID_STRUCTURE_SIGNALS = {
"headings", "bullet_lists", "numbered_lists", "tables",
"code_blocks", "citations", "footnotes", "images",
"forms", "columns", "sections"
}
def pre_classify_document(source, content):
    """Classify a document by rule and strip its capture/dream template header.

    Args:
        source: Path-like string; only the lower-cased basename is inspected.
        content: Raw document text.

    Returns:
        A ``(doc_type, cleaned_content)`` tuple. ``doc_type`` is ``None`` when
        no rule matches, leaving classification to the model.
        ``cleaned_content`` is the text after the first ``---`` separator when
        the part before it looks like a template header and the remainder is
        non-empty; otherwise it is ``content`` unchanged.
    """
    name = os.path.basename(source).lower()

    # Template stripping: everything before the first "---" counts as a
    # header only when it carries one of the known template markers.
    stripped = content
    head, separator, tail = content.partition("---")
    if separator:
        tail = tail.strip()
        lowered_head = head.lower()
        template_markers = ("**type:**", "**modality:**", "# capture", "# dream")
        if tail and any(marker in lowered_head for marker in template_markers):
            stripped = tail

    # Filename rules, evaluated in priority order; the first match wins.
    code_suffixes = (".py", ".js", ".ts", ".cpp", ".c", ".h", ".java", ".rs")
    filename_rules = (
        ("dream_nrem", "nrem" in name),
        ("dream_lucid", "lucid" in name),
        ("dream_rem", "-rem-" in name or name.endswith("-rem.md")),
        ("dream_synthesis", "synthesis" in name and name.endswith(".md")),
        ("voice_capture", "-voice" in name or "voice-" in name),
        ("image_capture", "-image" in name or "image-" in name),
        ("presentation", name.endswith((".pptx", ".ppt"))),
        ("spreadsheet", name.endswith((".xlsx", ".xls", ".csv"))),
        ("code", name.endswith(code_suffixes)),
        ("code", name.endswith("cmakelists.txt") or name == "makefile"),
    )
    for label, matched in filename_rules:
        if matched:
            return label, stripped

    # Content rules apply only when the filename gave no answer. They look at
    # the raw content, not the stripped text.
    if content.startswith("# Dream"):
        opening = content[:50].lower()
        # "nrem" must be tested before "rem" because it contains it.
        for token, label in (("nrem", "dream_nrem"),
                             ("lucid", "dream_lucid"),
                             ("rem", "dream_rem")):
            if token in opening:
                return label, stripped
        return "dream_synthesis", stripped

    if content.startswith("# Capture"):
        if "voice" in content[:100].lower():
            return "voice_capture", stripped
        return "image_capture", stripped

    return None, stripped
def build_briefing_prompt(content, pre_classified_type=None):
    """Assemble the briefing prompt sent to the local model.

    Args:
        content: Document text; only the first 1500 characters are included.
        pre_classified_type: Rule-derived document type. When truthy it is
            written into the schema as a fixed value; otherwise the model is
            given the list of types it may choose from.

    Returns:
        The complete prompt string.
    """
    if pre_classified_type:
        type_field = f'\n "document_type": "{pre_classified_type}", // pre-classified, do not change'
    else:
        type_field = '\n "document_type": "one of: academic_pdf, technical_doc, chat_log, book_excerpt, form, syllabus, email, notes, purchase_order, annual_report, invoice, memo, report, unknown",'

    # Fixed text up to and including the opening brace of the schema.
    opening = (
        "Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.\n"
        "Return exactly this structure:\n"
        "{"
    )
    # Remainder of the schema plus the rules; ends right before the excerpt.
    schema_and_rules = """
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}
Rules:
- density: high=information dense technical or academic, medium=mixed, low=narrative/literary/sparse/short
- has_proper_nouns: true if you see capitalized words that are NOT sentence starts or template headers
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: use ONLY these terms: headings, bullet_lists, numbered_lists, tables, code_blocks, citations, footnotes, images, forms, columns, sections
- noise_signals: use ONLY these terms: repeated_headers, page_numbers, formatting_artifacts, boilerplate, watermarks, footers, line_numbers, encoding_artifacts, ocr_errors
- extraction_priority: full if density=high and likely_has_named_entities=true; skip if density=low AND likely_has_named_entities=false AND content is under 200 words; partial otherwise
Document:
"""
    excerpt = content[:1500]
    return opening + type_field + schema_and_rules + excerpt
def get_sample_documents():
    """Fetch up to SAMPLE_SIZE rows from ``embeddings``, one per distinct source.

    Only rows whose document length is strictly between 100 and 3000 are
    considered.

    Returns:
        A list of dict-like rows with ``id``, ``document``, ``source`` and
        ``created_at`` keys.

    Raises:
        RuntimeError: If PG_DSN is not configured.

    NOTE(review): ``ORDER BY source, random()`` randomises which row stands
    for each source, but LIMIT is applied after ordering by source, so the
    result is the first SAMPLE_SIZE sources in sort order rather than a random
    subset of sources. The query is left unchanged here; confirm intent.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    # try/finally so the cursor and connection are released even when the
    # query fails (psycopg2's ``with conn`` does not close the connection).
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute("""
                SELECT DISTINCT ON (source) id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                AND length(document) < 3000
                ORDER BY source, random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()
def run_briefing(prompt):
    """Send ``prompt`` to the Ollama endpoint and parse the JSON object it returns.

    Args:
        prompt: Full prompt text.

    Returns:
        ``(briefing_dict, raw_response)`` on success, or
        ``(None, error_message)`` when the request fails or no JSON object
        can be extracted. Failures are reported, never raised.
    """
    body = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            envelope = json.loads(resp.read().decode())
        raw = envelope.get("response", "").strip()

        # The model may wrap the object in prose, so take the outermost
        # "{" ... "}" span.
        opening = raw.find("{")
        closing = raw.rfind("}")
        if opening == -1 or closing == -1:
            return None, f"NO_JSON: {raw[:200]}"

        candidate = json.loads(raw[opening:closing + 1])
        if isinstance(candidate, dict):
            return candidate, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        # Best-effort experiment runner: any other failure becomes a FAILED
        # record in the caller instead of aborting the run.
        return None, f"ERROR: {type(e).__name__}: {e}"
def sanitize_briefing(briefing, pre_classified_type=None):
    """Coerce a model-produced briefing into the expected schema.

    Args:
        briefing: Dict parsed from the model response.
        pre_classified_type: When truthy, used as ``document_type`` and the
            model's own value is ignored.

    Returns:
        A new dict holding only the known fields: enumerated fields fall back
        to a default when invalid, flag fields become real booleans, and
        signal lists are restricted to the controlled vocabularies.
    """
    def pick(key, allowed, fallback):
        # Normalise to lower-case, trimmed text; reject anything not in the set.
        candidate = str(briefing.get(key, fallback)).lower().strip()
        return candidate if candidate in allowed else fallback

    clean = {}
    clean["document_type"] = (
        pre_classified_type
        if pre_classified_type
        else pick("document_type", VALID_DOC_TYPES, "unknown")
    )
    clean["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]
    clean["density"] = pick("density", VALID_DENSITIES, "medium")

    flag_fields = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for flag in flag_fields:
        value = briefing.get(flag, False)
        if isinstance(value, str):
            # Models sometimes emit "true"/"yes" as strings.
            clean[flag] = value.lower() in ("true", "yes", "1")
        else:
            clean[flag] = bool(value)

    vocabularies = (
        ("structure_signals", VALID_STRUCTURE_SIGNALS),
        ("noise_signals", VALID_NOISE_SIGNALS),
    )
    for key, vocabulary in vocabularies:
        value = briefing.get(key, [])
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, list):
            value = []
        normalised = (str(item).lower().strip() for item in value)
        clean[key] = [term for term in normalised if term in vocabulary]

    clean["extraction_priority"] = pick("extraction_priority", VALID_PRIORITIES, "partial")
    return clean
def estimate_token_reduction(original_text, briefing):
    """Estimate the API token saving a briefing would give for one document.

    Tokens are approximated as ``len(text) / 4`` with a floor of 1. A fixed
    200 tokens is counted as saved orientation overhead, and each noise
    signal is counted as 5% of the document, capped at 40%.

    Args:
        original_text: Text that would be sent to the API.
        briefing: Sanitised briefing dict.

    Returns:
        A dict of estimates. For ``extraction_priority == "skip"`` the
        reduction is reported as 100% with an explanatory ``note``; otherwise
        ``total_reduction_pct`` is capped at 99.0.
    """
    approx_tokens = max(len(original_text) / 4, 1)
    orientation_saved = 200

    if briefing.get("extraction_priority") == "skip":
        # Nothing is sent at all, so the whole call counts as saved.
        return {
            "original_tokens_approx": round(approx_tokens),
            "orientation_tokens_saved": round(approx_tokens + 200),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call",
        }

    signal_count = len(briefing.get("noise_signals", []))
    noise_fraction = min(signal_count * 0.05, 0.40)
    noise_saved = approx_tokens * noise_fraction
    saved = orientation_saved + noise_saved
    overall_pct = min((saved / (approx_tokens + 200)) * 100, 99.0)

    estimate = {"original_tokens_approx": round(approx_tokens)}
    estimate["orientation_tokens_saved"] = orientation_saved
    estimate["noise_tokens_saved"] = round(noise_saved)
    estimate["noise_reduction_pct"] = round(noise_fraction * 100, 1)
    estimate["total_reduction_pct"] = round(overall_pct, 1)
    return estimate
def format_eta(elapsed_times, completed, total):
    """Return an ``"ETA: h:mm:ss"`` label based on the mean time per item so far.

    Args:
        elapsed_times: Durations in seconds of the items processed so far.
        completed: Number of items processed.
        total: Total number of items.

    Returns:
        ``"ETA: --:--"`` before anything has completed, otherwise the
        projected remaining time truncated to whole seconds.
    """
    if completed == 0:
        return "ETA: --:--"
    mean_seconds = sum(elapsed_times) / completed
    seconds_left = int((total - completed) * mean_seconds)
    return "ETA: " + str(timedelta(seconds=seconds_left))
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of ``text``.

    Used as a short content fingerprint in the results file, not for security.
    """
    digest = hashlib.md5(text.encode())
    return digest.hexdigest()[:8]
def main():
    """Run the v2 briefing experiment and write the results to RESULTS_FILE.

    For every sampled document: pre-classify by rule, build the prompt, query
    the local model, sanitise the briefing, estimate the token reduction and
    record the outcome. A summary is appended once all documents are done.

    Differences from the earlier revision of this function:
      * an empty sample no longer raises ZeroDivisionError when the success
        rate is computed (the per-document average was already guarded);
      * a NULL/empty ``source`` is reported as "unknown" instead of crashing
        the progress line.
    """
    test_start = time.time()
    print("\nBirdAI Briefing Generator v2 — Experiment 002b")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print("Changes: rule-based doc_type, template stripping, controlled vocab")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")
    results = {
        "meta": {"model": MODEL, "version": "v2", "sample_size": len(docs),
                 "started": datetime.now().isoformat(), "completed": None,
                 "total_elapsed_seconds": None, "avg_seconds_per_doc": None},
        "documents": [], "summary": {}
    }
    success_count = 0
    failed_count = 0
    pre_classified_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # The key is always present in a query row, so a plain .get() default
        # would never apply; ``or`` also covers a NULL source.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)
        pre_type, cleaned_content = pre_classify_document(source, content)
        was_pre_classified = pre_type is not None
        if was_pre_classified:
            pre_classified_count += 1
        eta_str = format_eta(elapsed_times, i, len(docs))
        # R = type decided by rule, M = type left to the model.
        pre_flag = "R" if was_pre_classified else "M"
        print(f"[{i+1:02d}/{len(docs)}][{pre_flag}] {source[:36]:<36} {eta_str:<14}", end=" ", flush=True)
        prompt = build_briefing_prompt(cleaned_content, pre_type)
        t_start = time.time()
        briefing, raw = run_briefing(prompt)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)
        if briefing is None:
            # On failure run_briefing returns the error text in ``raw``.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "pre_classified_type": pre_type, "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing, pre_type)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(cleaned_content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"{priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "cleaned_content_length": len(cleaned_content),
                "status": "SUCCESS", "pre_classified_type": pre_type,
                "was_pre_classified": was_pre_classified, "elapsed_seconds": elapsed,
                "briefing": briefing, "token_reduction_estimate": reduction
            })
        # Checkpoint after every document so an interrupted run keeps its
        # partial results.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc
    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the ratio: an empty sample must yield 0% / not viable, not a crash.
    success_ratio = success_count / total if total else 0.0
    summary = {
        "total": total, "success": success_count, "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "pre_classified_by_rule": pre_classified_count,
        "classified_by_model": total - pre_classified_count,
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed, "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8
    }
    results["summary"] = summary
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 75)
    print("RESULTS — Briefing Generator v2")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Pre-classified (rule): {pre_classified_count}")
    print(f" Classified (model): {total - pre_classified_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)
if __name__ == "__main__":
main()
+313
View File
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
BirdAI Briefing Generator Test
===============================
Tests the local LLM as a document briefing generator.
The local model produces a structured roadmap for the API —
cleaning, structure detection, signal flagging — without semantic judgment.
Results written to ~/aaronai/briefing_test_results.json
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# Postgres connection string, taken from the environment after the .env file
# has been loaded; get_sample_documents() refuses to run when it is unset.
PG_DSN = os.getenv("PG_DSN")
# JSON file that main() writes the per-document results and summary to.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json")
# Model name sent in the request body to the Ollama endpoint below.
MODEL = "mistral"
# Passed as the SQL LIMIT when sampling documents.
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"
# Accepted document_type values; sanitize_briefing() maps anything else the
# model returns to "unknown".
VALID_DOC_TYPES = {
"academic_pdf", "technical_doc", "chat_log", "code",
"presentation", "book_excerpt", "form", "syllabus",
"email", "notes", "unknown"
}
# Accepted density values; anything else falls back to "medium".
VALID_DENSITIES = {"high", "medium", "low"}
# Accepted extraction_priority values; anything else falls back to "partial".
VALID_PRIORITIES = {"full", "partial", "skip"}
# Instruction prefix for the local model. run_briefing() appends the first
# 1500 characters of the document directly after the trailing "Document:" line.
BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.
Return exactly this structure:
{
"document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown",
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}
Rules:
- document_type: identify from formatting patterns and vocabulary, not meaning
- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse
- has_proper_nouns: true if you see capitalized words that are not sentence starts
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"]
- noise_signals: list any noise patterns you see e.g. ["repeated_headers", "page_numbers", "formatting_artifacts", "boilerplate"]
- extraction_priority: full if density=high and likely_has_named_entities=true, skip if density=low and likely_has_named_entities=false, partial otherwise
Document:
"""
def get_sample_documents():
    """Fetch up to SAMPLE_SIZE rows from ``embeddings``, one per distinct source.

    Only rows whose document length is strictly between 100 and 3000 are
    considered.

    Returns:
        A list of dict-like rows with ``id``, ``document``, ``source`` and
        ``created_at`` keys.

    Raises:
        RuntimeError: If PG_DSN is not configured.

    NOTE(review): ``ORDER BY source, random()`` randomises which row stands
    for each source, but LIMIT is applied after ordering by source, so the
    result is the first SAMPLE_SIZE sources in sort order rather than a random
    subset of sources. The query is left unchanged here; confirm intent.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    # try/finally so the cursor and connection are released even when the
    # query fails (psycopg2's ``with conn`` does not close the connection).
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute("""
                SELECT DISTINCT ON (source) id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                AND length(document) < 3000
                ORDER BY source, random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()
def run_briefing(text):
    """Ask the local model for a briefing of ``text`` and parse its JSON reply.

    Args:
        text: Document text; only the first 1500 characters are sent, appended
            to BRIEFING_PROMPT.

    Returns:
        ``(briefing_dict, raw_response)`` on success, or
        ``(None, error_message)`` when the request fails or no JSON object
        can be extracted. Failures are reported, never raised.
    """
    request_body = {
        "model": MODEL,
        "prompt": BRIEFING_PROMPT + text[:1500],
        "stream": False,
    }
    payload = json.dumps(request_body).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL, data=payload, headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            envelope = json.loads(resp.read().decode())
        raw = envelope.get("response", "").strip()

        # The model may wrap the object in prose, so take the outermost
        # "{" ... "}" span.
        opening = raw.find("{")
        closing = raw.rfind("}")
        if opening == -1 or closing == -1:
            return None, f"NO_JSON: {raw[:200]}"

        candidate = json.loads(raw[opening:closing + 1])
        if isinstance(candidate, dict):
            return candidate, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        # Best-effort experiment runner: any other failure becomes a FAILED
        # record in the caller instead of aborting the run.
        return None, f"ERROR: {type(e).__name__}: {e}"
def sanitize_briefing(briefing):
    """Coerce a model-produced briefing into the expected schema.

    Args:
        briefing: Dict parsed from the model response.

    Returns:
        A new dict holding only the known fields: enumerated fields fall back
        to a default when invalid, flag fields become real booleans, and the
        signal fields become lists of strings (free-form; not vocabulary
        checked in this version).
    """
    def pick(key, allowed, fallback):
        # Normalise to lower-case, trimmed text; reject anything not in the set.
        candidate = str(briefing.get(key, fallback)).lower().strip()
        return candidate if candidate in allowed else fallback

    clean = {"document_type": pick("document_type", VALID_DOC_TYPES, "unknown")}
    clean["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]
    clean["density"] = pick("density", VALID_DENSITIES, "medium")

    flag_fields = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for flag in flag_fields:
        value = briefing.get(flag, False)
        if isinstance(value, str):
            # Models sometimes emit "true"/"yes" as strings.
            clean[flag] = value.lower() in ("true", "yes", "1")
        else:
            clean[flag] = bool(value)

    for key in ("structure_signals", "noise_signals"):
        value = briefing.get(key, [])
        if isinstance(value, list):
            clean[key] = [str(item) for item in value if item]
        elif isinstance(value, str) and value:
            clean[key] = [value]
        else:
            clean[key] = []

    clean["extraction_priority"] = pick("extraction_priority", VALID_PRIORITIES, "partial")
    return clean
def estimate_token_reduction(original_text, briefing):
    """Estimate the API token saving a briefing would give for one document.

    Tokens are approximated as ``len(text) / 4`` with a floor of 1. A fixed
    200 tokens is counted as saved orientation overhead, and each noise
    signal is counted as 5% of the document, capped at 40%.

    Args:
        original_text: Text that would be sent to the API.
        briefing: Sanitised briefing dict.

    Returns:
        A dict of estimates. For ``extraction_priority == "skip"`` the
        reduction is reported as 100% with an explanatory ``note``; otherwise
        ``total_reduction_pct`` is capped at 99.0.
    """
    approx_tokens = max(len(original_text) / 4, 1)
    orientation_saved = 200

    is_skip = briefing.get("extraction_priority") == "skip"
    if is_skip:
        # Nothing is sent at all, so the whole call counts as saved.
        skipped = {"original_tokens_approx": round(approx_tokens)}
        skipped["orientation_tokens_saved"] = round(approx_tokens + 200)
        skipped["noise_reduction_pct"] = 100.0
        skipped["total_reduction_pct"] = 100.0
        skipped["note"] = "skip — no API call"
        return skipped

    noise_fraction = min(len(briefing.get("noise_signals", [])) * 0.05, 0.40)
    noise_saved = approx_tokens * noise_fraction
    saved = orientation_saved + noise_saved
    baseline_cost = approx_tokens + 200
    overall_pct = min((saved / baseline_cost) * 100, 99.0)
    return {
        "original_tokens_approx": round(approx_tokens),
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_saved),
        "noise_reduction_pct": round(noise_fraction * 100, 1),
        "total_reduction_pct": round(overall_pct, 1),
    }
def format_eta(elapsed_times, completed, total):
    """Return an ``"ETA: h:mm:ss"`` label based on the mean time per item so far.

    Args:
        elapsed_times: Durations in seconds of the items processed so far.
        completed: Number of items processed.
        total: Total number of items.

    Returns:
        ``"ETA: --:--"`` before anything has completed, otherwise the
        projected remaining time truncated to whole seconds.
    """
    if completed == 0:
        return "ETA: --:--"
    per_item = sum(elapsed_times) / completed
    outstanding = total - completed
    projected = timedelta(seconds=int(outstanding * per_item))
    return "ETA: " + str(projected)
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of ``text``.

    Used as a short content fingerprint in the results file, not for security.
    """
    hex_digest = hashlib.md5(text.encode()).hexdigest()
    short = hex_digest[:8]
    return short
def main():
    """Run the briefing experiment and write the results to RESULTS_FILE.

    For every sampled document: query the local model, sanitise the briefing,
    estimate the token reduction and record the outcome. A summary is
    appended once all documents are done.

    Differences from the earlier revision of this function:
      * an empty sample no longer raises ZeroDivisionError when the success
        rate is computed (the per-document average was already guarded);
      * a NULL/empty ``source`` is reported as "unknown" instead of crashing
        the progress line.
    """
    test_start = time.time()
    print("\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")
    results = {
        "meta": {
            "model": MODEL,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }
    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # The key is always present in a query row, so a plain .get() default
        # would never apply; ``or`` also covers a NULL source.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)
        eta_str = format_eta(elapsed_times, i, len(docs))
        print(f"[{i+1:02d}/{len(docs)}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)
        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)
        if briefing is None:
            # On failure run_briefing returns the error text in ``raw``.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"{priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })
        # Checkpoint after every document so an interrupted run keeps its
        # partial results.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc
    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the ratio: an empty sample must yield 0% / not viable, not a crash.
    success_ratio = success_count / total if total else 0.0
    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8
    }
    results["summary"] = summary
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 75)
    print("RESULTS")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)
if __name__ == "__main__":
main()
@@ -0,0 +1,508 @@
#!/usr/bin/env python3
"""
Cascade Optimization Test — skip-small + compressed-draft
Tests whether two optimizations on the entity-drafter cascade meaningfully
improve the savings ceiling beyond the prior unoptimized cascade (12.66%).
Optimizations:
A — Skip-small-docs routing: docs <1000 chars bypass the local pass entirely
B — Compressed draft format: bare JSON array instead of markdown bullets
Conditions:
A — Baseline: single Claude Haiku call, full extraction (unchanged from prior)
B — Optimized cascade: skip-small + compressed draft, otherwise same cascade
Sample: 30 docs from briefing_test_v2_results.json:
- 10 small (<1000 chars) — should show 0% delta if skip-small works
- 12 medium (1000-5000 chars) — primary test bucket
- 8 large (5000-12000 chars, capped at 12K)
Mistral context: 12K (raised from 8K in prior run).
Outputs: ~/aaronai/experiments/cascade_optimization_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
# Input: results of the v2 briefing run; main() samples its SUCCESS documents.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output location named in the module docstring.
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_optimization_results.json"
# Parameters passed to the Anthropic messages.create call in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
# Local Ollama endpoint and model used by call_local().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180 # seconds for the HTTP request; raised — 12K context can take longer
MAX_DOC_CHARS = 12000 # cap applied by fetch_document_text(); raised from 8K
# Documents shorter than this many characters are marked as skip-small in main().
SKIP_SMALL_THRESHOLD = 1000
# NOTE(review): presumably Haiku price in USD per million input/output
# tokens — not used in the visible part of the file; confirm against the
# cost computation.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline): the document text is appended after "DOCUMENT:".
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Local-model pass: names only; call_local() appends the document text.
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
Return ONLY valid JSON:
{
"candidates": [string]
}
Just names. No types, no relationships. JSON only.
DOCUMENT:
"""
# Compressed draft format — bare JSON array, minimal preamble
# NOTE(review): this template mixes the {local_draft_json} placeholder with
# unescaped literal JSON braces, so str.format() would fail on it; it
# presumably has to be filled with str.replace() — confirm at the call site.
CONDITION_B_API_PROMPT_COMPRESSED = """Extract a knowledge graph from the document below.
Local model entity candidates (hint, not authoritative — verify against the document, ignore false ones, add missed ones):
{local_draft_json}
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
def strip_json_fences(text):
    """Remove a surrounding Markdown code fence from a model response.

    Args:
        text: Raw response text; may be empty or ``None``.

    Returns:
        The trimmed text without a leading ```` ``` ```` / ```` ```json ````
        fence and without a trailing ```` ``` ```` fence. Falsy input gives
        ``""``.
    """
    if not text:
        return ""
    unfenced = re.sub(r"^```(?:json)?\s*", "", text.strip())
    unfenced = re.sub(r"\s*```$", "", unfenced)
    return unfenced.strip()
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its ``embeddings`` rows.

    All rows for ``source`` are read in ``id`` order and joined with blank
    lines.

    Args:
        pg_conn: Open database connection; it is not closed here.
        source: Value matched against the ``source`` column.

    Returns:
        ``(text, original_length)`` where ``text`` is capped at MAX_DOC_CHARS
        and ``original_length`` is the length before capping, or ``(None, 0)``
        when no rows exist for ``source``.
    """
    cur = pg_conn.cursor()
    # try/finally so the cursor is released even when the query raises.
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one user message to Claude Haiku and collect usage and output.

    Args:
        client: Anthropic client exposing ``messages.create``.
        prompt_text: Full prompt sent as the single user message.

    Returns:
        A dict with ``input_tokens``, ``output_tokens``, ``latency_s``
        (wall-clock seconds, 2 decimals), ``response_text`` (text of the
        first content block, or ``""`` when there is none) and
        ``stop_reason``. Exceptions from the client propagate to the caller.
    """
    started = time.time()
    message = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = message.usage
    outcome = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if message.content:
        outcome["response_text"] = message.content[0].text
    else:
        outcome["response_text"] = ""
    outcome["stop_reason"] = message.stop_reason
    return outcome
def call_local(document_text):
    """Ask the local Ollama model for entity candidates in ``document_text``.

    Args:
        document_text: Document text appended to LOCAL_PROMPT.

    Returns:
        ``{"response": str, "latency_s": float}`` on success, or
        ``{"error": str, "latency_s": float}`` when anything goes wrong.
        Failures are reported, never raised.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        reply = requests.post(
            OLLAMA_URL,
            json={
                "model": LOCAL_MODEL,
                "prompt": LOCAL_PROMPT + document_text,
                "stream": False,
                "format": "json",
                "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
            },
            timeout=LOCAL_TIMEOUT,
        )
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as e:
        # Deliberately broad: a failed local pass is recorded as an error
        # result rather than aborting the run.
        return {"error": str(e), "latency_s": elapsed()}
def parse_graph(raw):
    """Count the entities and edges in a knowledge-graph JSON response.

    Args:
        raw: Model response text, optionally wrapped in a Markdown fence.

    Returns:
        ``(entity_count, edge_count)`` when the response is a JSON object
        whose ``entities`` and ``edges`` are both lists, otherwise
        ``(None, None)``.
    """
    unparsed = (None, None)

    payload = strip_json_fences(raw)
    if not payload:
        return unparsed
    try:
        graph = json.loads(payload)
    except json.JSONDecodeError:
        return unparsed
    if not isinstance(graph, dict):
        return unparsed

    entities, edges = graph.get("entities"), graph.get("edges")
    if not (isinstance(entities, list) and isinstance(edges, list)):
        return unparsed
    return len(entities), len(edges)
def parse_candidates(raw):
    """Extract the candidate entity names from the local model's response.

    Args:
        raw: Model response text, optionally wrapped in a Markdown fence.

    Returns:
        A list of trimmed, non-empty candidate strings, or ``None`` when the
        response is not a JSON object with a list under ``"candidates"``.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    # Filter again after stripping: a whitespace-only candidate is truthy
    # before the strip and would otherwise come back as an empty name.
    names = (str(c).strip() for c in cands if c)
    return [name for name in names if name]
def stratify(docs):
    """Pick up to 10 small / 12 medium / 8 large docs by length, in input order.

    Buckets are decided by each entry's ``content_length``: small is below
    1000, medium is 1000-4999, large is 5000 and above.

    Args:
        docs: Iterable of dicts that each carry a ``content_length`` key.

    Returns:
        A list with the first 10 small, then the first 12 medium, then the
        first 8 large entries.

    NOTE(review): briefing_test_v2_results.json records the length of a single
    sampled row, and the v2 sampling query only admits rows shorter than 3000
    characters, so the large bucket looks like it will always be empty for
    that input — confirm the sample really reaches 30 documents.
    """
    buckets = {"small": [], "medium": [], "large": []}
    for entry in docs:
        length = entry["content_length"]
        if length < 1000:
            buckets["small"].append(entry)
        elif length < 5000:
            buckets["medium"].append(entry)
        else:
            buckets["large"].append(entry)
    return buckets["small"][:10] + buckets["medium"][:12] + buckets["large"][:8]
def _condition_a_record(a, a_ents, a_edges):
    """Build the JSON-serialisable record of one Condition A (baseline) call."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _fmt_pct(value):
    """Format a percentage delta for console output; ``n/a`` when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the skip-small + compressed-draft cascade experiment end to end.

    Reads ANTHROPIC_API_KEY and PG_DSN from the environment, loads the
    successful documents listed in V2_FILE, draws a length-stratified sample
    and, for every document, runs:

    * Condition A: one Haiku call with the full-extraction prompt.
    * Condition B: for documents shorter than SKIP_SMALL_THRESHOLD the
      Condition A result is reused; otherwise the local model drafts entity
      candidates and Haiku is called again with that draft as a JSON array.

    Per-document records, token/cost totals and per-size-bucket statistics
    are written to OUTPUT_FILE as JSON and summarised on stdout. Exits with
    status 1 when the environment variables or V2_FILE are missing.
    """
    # Cost delta of the earlier, unoptimised cascade run; reported for comparison.
    prior_pct = -12.66

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)

    print(f"Sample: {len(sample)} docs (10s/12m/8l, file order)")
    print(f"Skip-small threshold: <{SKIP_SMALL_THRESHOLD} chars")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    # try/finally so the connection is released even if a document blows up.
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket on the length actually sent, not on the metadata length.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            skip_small_routed = sent_len < SKIP_SMALL_THRESHOLD
            trunc_marker = "*" if truncated else " "
            route_marker = "[skip-small]" if skip_small_routed else "[cascade] "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] "
                  f"{route_marker} {source[:50]}", flush=True)

            # Condition A — always runs
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges = parse_graph(a["response_text"])
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"ents={a_ents} edges={a_edges} stop={a['stop_reason']} t={a['latency_s']}s",
                      flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_ents = a_edges = None

            # Fields shared by every outcome; "condition_b" is attached below.
            record = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "skip_small_routed": skip_small_routed,
                "condition_a": _condition_a_record(a, a_ents, a_edges),
            }

            # Condition B
            if skip_small_routed:
                # Skip-small: B = A. Same call, no local pass.
                print(" B: routed to baseline (skip-small)", flush=True)
                b, b_ents, b_edges = a, a_ents, a_edges
                local_result = {"skipped": "skip_small_routed"}
                local_candidates = []
                local_raw = ""
            else:
                local_result = call_local(doc_text)
                if "error" in local_result:
                    print(f" B local FAILED: {local_result['error']} — recording skip", flush=True)
                    record["condition_b"] = {
                        "skipped": "local_model_failed",
                        "local_error": local_result["error"],
                        "local_latency_s": local_result.get("latency_s"),
                    }
                    results.append(record)
                    continue

                local_raw = local_result["response"]
                local_candidates = parse_candidates(local_raw) or []
                print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
                      flush=True)
                if not local_candidates:
                    print(" B local: empty draft — skipping API call", flush=True)
                    record["condition_b"] = {
                        "skipped": "local_draft_empty",
                        "local_latency_s": local_result.get("latency_s"),
                        "local_raw": local_raw[:1000],
                    }
                    results.append(record)
                    continue

                # Compressed draft format — bare JSON array
                local_draft_json = json.dumps(local_candidates, ensure_ascii=False)
                # str.replace, not str.format: the template holds literal JSON braces.
                b_prompt = (
                    CONDITION_B_API_PROMPT_COMPRESSED.replace("{local_draft_json}", local_draft_json)
                    + doc_text
                )
                try:
                    b = call_haiku(client, b_prompt)
                    b_ents, b_edges = parse_graph(b["response_text"])
                    print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                          f"ents={b_ents} edges={b_edges} stop={b['stop_reason']} t={b['latency_s']}s",
                          flush=True)
                except Exception as e:
                    print(f" B api FAILED: {e}", flush=True)
                    b = {"error": str(e)}
                    b_ents = b_edges = None

            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                if a_edges and b_edges is not None and a_edges > 0:
                    edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
                print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)

            record["condition_b"] = {
                "skip_small_routed": skip_small_routed,
                "local_latency_s": local_result.get("latency_s"),
                "local_candidates": local_candidates,
                "local_raw": local_raw[:1000],
                "api_input_tokens": b.get("input_tokens"),
                "api_output_tokens": b.get("output_tokens"),
                "api_latency_s": b.get("latency_s"),
                "entity_count": b_ents,
                "edge_count": b_edges,
                "stop_reason": b.get("stop_reason"),
                "response_text": b.get("response_text", "")[:4000],
                "error": b.get("error"),
            }
            results.append(record)
    finally:
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions produced token counts are comparable.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
        be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
        skip_count = sum(1 for r in rows if r.get("skip_small_routed"))
        by_bucket[bucket] = {
            "n": len(rows),
            "n_skip_small_routed": skip_count,
            "n_cascade": len(rows) - skip_count,
            "a_input_tokens": ai,
            "a_output_tokens": ao,
            "b_input_tokens": bi,
            "b_output_tokens": bo,
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
            "b_avg_edges": round(statistics.mean(be), 1) if be else None,
        }

    summary = {
        "experiment": "cascade_optimization_test",
        "title": "Cascade Optimization — skip-small + compressed-draft",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "skip_small_threshold": SKIP_SMALL_THRESHOLD,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "prior_unoptimized_cascade_pct": prior_pct,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary["totals"]["cost_delta_pct"]
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
        opt_delta = delta_pct - prior_pct
        print(f"Optimization delta vs prior cascade: {opt_delta:+.2f} points "
              f"(prior was {prior_pct}%)")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # The deltas are None when a bucket's baseline token sum is zero.
            print(f" {bucket:6s} (n={stats['n']}, skip={stats['n_skip_small_routed']}): "
                  f"in {_fmt_pct(stats['input_delta_pct'])} "
                  f"out {_fmt_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print("Results: " + str(OUTPUT_FILE))


if __name__ == "__main__":
    main()
+485
View File
@@ -0,0 +1,485 @@
#!/usr/bin/env python3
"""
Cascade Test — Nodes-vs-Edges Experiment
Tests whether splitting graph extraction into "local drafts entity candidates,
API verifies + draws edges" reduces total API cost vs single-shot full
extraction, while producing a comparable graph.
Two conditions per document:
A — Baseline: single Claude Haiku call, full extraction
B — Cascade: Mistral lists entity candidates, then Haiku does verify+edges
Both conditions:
- See the full document (parity-respecting)
- Use open entity type vocabulary (no fixed schema)
- Use natural-language predicates (no constrained relations)
- Same target output schema, same temperature
Sample: 20 docs from briefing_test_v2_results.json, stratified by char length.
Reports API cost only. Local Mistral time is recorded but not monetized
(ran on the VPS, no per-token API charge).
Outputs: ~/aaronai/experiments/cascade_test_results.json
"""
import json
import os
import re
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
import requests
from dotenv import load_dotenv
# main() reads ANTHROPIC_API_KEY and PG_DSN from the environment; load the
# project's .env file before anything looks them up.
load_dotenv(Path.home() / "aaronai" / ".env")
# Input: document list produced by an earlier run; main() keeps only the
# entries whose "status" is "SUCCESS".
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output: JSON summary written at the end of main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_test_results.json"
# Parameters passed to every Haiku call in call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
# Local Ollama endpoint, model name and HTTP timeout used by call_local().
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 120
# fetch_document_text() truncates each document to this many characters.
MAX_DOC_CHARS = 8000
# Verified pricing 2026-04-28 against Anthropic docs
# main() multiplies token counts by these and divides by 1_000_000, storing
# the result under *_cost_usd, i.e. they are USD per million tokens.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
# Condition A (baseline) prompt: full graph extraction in a single Haiku
# call. main() appends the document text directly after "DOCUMENT:".
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
DOCUMENT:
"""
# Prompt for the local model's drafting pass: names only, returned under a
# "candidates" key (the shape parse_candidates() expects). call_local()
# appends the document text.
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
Return ONLY valid JSON:
{
"candidates": [string]
}
Just names. No types, no relationships. JSON only.
DOCUMENT:
"""
# Condition B (cascade) prompt: same output schema as Condition A plus the
# local draft as a hint. main() fills {local_draft} with str.replace (not
# str.format, so the literal JSON braces below are safe) and then appends
# the document text.
CONDITION_B_API_PROMPT_WITH_DRAFT = """Extract a knowledge graph from the document below.
A local model has identified entity candidates that may help orient your reading. Treat the candidates as a hint, not as truth — verify each candidate appears in the document, ignore any that do not, and add any entities the candidates missed.
Return ONLY valid JSON with this exact schema:
{
"entities": [
{"name": string, "type": string}
],
"edges": [
{"subject": string, "predicate": string, "object": string}
]
}
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
ENTITY CANDIDATES FROM LOCAL MODEL:
{local_draft}
DOCUMENT:
"""
def strip_json_fences(text):
    """Remove a surrounding markdown code fence from a model reply.

    A leading ``` (optionally followed by ``json``) and a trailing ``` are
    dropped, then surrounding whitespace is trimmed. Falsy input yields "".
    """
    if not text:
        return ""
    fence = "```"
    body = text.strip()
    if body.startswith(fence):
        body = body[len(fence):]
        if body.startswith("json"):
            body = body[len("json"):]
    if body.endswith(fence):
        body = body[:-len(fence)]
    # The final strip also removes whitespace that sat next to either fence.
    return body.strip()
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its stored chunks.

    Args:
        pg_conn: Open psycopg2 connection.
        source: Value matched against ``embeddings.source``.

    Returns:
        ``(text, full_length)``: the chunks for ``source`` joined with blank
        lines in ``id`` order and truncated to MAX_DOC_CHARS, plus the length
        of the untruncated text. ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    if not rows:
        return None, 0
    # NOTE(review): skipping NULL chunks assumes `document` may be nullable —
    # confirm against the schema. Without this, str.join raises TypeError.
    full = "\n\n".join(row[0] for row in rows if row[0] is not None)
    return full[:MAX_DOC_CHARS], len(full)
def call_haiku(client, prompt_text):
    """Send one single-turn user message to Haiku and summarise the reply.

    Args:
        client: Object exposing ``messages.create`` (built in ``main`` via
            ``anthropic.Anthropic``).
        prompt_text: Full prompt, sent as the only user message.

    Returns:
        dict with ``input_tokens``, ``output_tokens``, ``latency_s``
        (wall-clock seconds, 2 decimals), ``response_text`` (text of the
        first content block, ``""`` when there is none) and ``stop_reason``.
    """
    started = time.time()
    message = client.messages.create(
        messages=[{"role": "user", "content": prompt_text}],
        model=HAIKU_MODEL,
        temperature=HAIKU_TEMPERATURE,
        max_tokens=HAIKU_MAX_TOKENS,
    )
    usage = message.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if message.content:
        summary["response_text"] = message.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = message.stop_reason
    return summary
def call_local(document_text):
    """Ask the local Ollama model for a JSON entity draft of the document.

    Never raises: any failure (transport, HTTP status, bad JSON body) is
    reported as ``{"error": <message>, "latency_s": <seconds>}``. On success
    the dict holds ``response`` (raw model text, ``""`` if absent) and
    ``latency_s``.
    """
    started = time.time()
    try:
        request_body = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 8192},
        }
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": round(time.time() - started, 2)}
    except Exception as exc:
        # Deliberate best-effort: the caller records the failure and moves on.
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
def parse_graph(raw):
    """Count entities and edges in a model's JSON graph reply.

    Returns ``(entity_count, edge_count)`` when ``raw`` (after fence
    stripping) is a JSON object whose ``entities`` and ``edges`` are both
    lists; otherwise ``(None, None)``.
    """
    unparsed = (None, None)
    text = strip_json_fences(raw)
    if not text:
        return unparsed
    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return unparsed
    if isinstance(payload, dict):
        entities = payload.get("entities")
        edges = payload.get("edges")
        if isinstance(entities, list) and isinstance(edges, list):
            return len(entities), len(edges)
    return unparsed
def parse_candidates(raw):
    """Extract the local model's entity-candidate names from its JSON reply.

    Args:
        raw: Raw model text; may be wrapped in markdown code fences.

    Returns:
        A list of stripped, non-empty candidate strings when ``raw`` is a
        JSON object with a list under ``"candidates"``; ``None`` when the
        reply is empty, not valid JSON, not an object, or has no such list.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    names = []
    for cand in cands:
        if not cand:
            continue
        name = str(cand).strip()
        # Whitespace-only entries are truthy before stripping; drop them so
        # they do not reach the API prompt as empty bullets or mask an
        # effectively empty draft from the caller's emptiness check.
        if name:
            names.append(name)
    return names
def stratify(docs, *, n_small=5, n_medium=10, n_large=5):
    """Pick a length-stratified sample, preserving file order in each bucket.

    Buckets are defined on ``doc["content_length"]``: small is ``< 1000``
    characters, medium is ``1000 <= n < 5000``, large is ``>= 5000``.

    Args:
        docs: Iterable of dicts that each carry a ``content_length`` key.
        n_small: Maximum number of small documents to take (default 5).
        n_medium: Maximum number of medium documents to take (default 10).
        n_large: Maximum number of large documents to take (default 5).

    Returns:
        A new list: the first ``n_small`` small docs, then the first
        ``n_medium`` medium docs, then the first ``n_large`` large docs.
        A bucket with fewer docs than its quota contributes what it has.

    Raises:
        KeyError: If a document has no ``content_length`` key.
    """
    small, medium, large = [], [], []
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            small.append(doc)
        elif length < 5000:
            medium.append(doc)
        else:
            large.append(doc)
    return small[:n_small] + medium[:n_medium] + large[:n_large]
def _condition_a_record(a, a_ents, a_edges):
    """Build the JSON-serialisable record of one Condition A (baseline) call."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _fmt_pct(value):
    """Format a percentage delta for console output; ``n/a`` when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the nodes-vs-edges cascade experiment end to end.

    Reads ANTHROPIC_API_KEY and PG_DSN from the environment, loads the
    successful documents listed in V2_FILE, draws a length-stratified sample
    and, for every document, runs:

    * Condition A: one Haiku call with the full-extraction prompt.
    * Condition B: the local model drafts entity candidates, then Haiku is
      called with the same document plus that draft. Documents whose local
      pass fails or yields no candidates are recorded as skipped for B.

    Per-document records, token/cost totals and per-size-bucket statistics
    are written to OUTPUT_FILE as JSON and summarised on stdout. Exits with
    status 1 when the environment variables or V2_FILE are missing.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)

    print(f"Sample: {len(sample)} docs (stratified by char length, file order)")
    for d in sample:
        print(f" [{d['content_length']:>6}c] {d['source'][:60]}")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print(f"Local model: {LOCAL_MODEL}")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    # try/finally so the connection is released even if a document blows up.
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:60]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket on the length actually sent, not on the metadata length.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges = parse_graph(a["response_text"])
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"ents={a_ents} edges={a_edges} stop={a['stop_reason']} t={a['latency_s']}s",
                      flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_ents = a_edges = None

            # Fields shared by every outcome; "condition_b" is attached below.
            record = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": _condition_a_record(a, a_ents, a_edges),
            }

            # Condition B local pass
            local_result = call_local(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']} — skipping doc", flush=True)
                record["condition_b"] = {
                    "skipped": "local_model_failed",
                    "local_error": local_result["error"],
                    "local_latency_s": local_result.get("latency_s"),
                }
                results.append(record)
                continue

            local_raw = local_result["response"]
            local_candidates = parse_candidates(local_raw) or []
            print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
                  flush=True)
            if not local_candidates:
                print(" B local: empty draft — skipping API call to avoid asymmetric test", flush=True)
                record["condition_b"] = {
                    "skipped": "local_draft_empty",
                    "local_latency_s": local_result.get("latency_s"),
                    "local_raw": local_raw[:1000],
                }
                results.append(record)
                continue

            local_draft_str = "\n".join(f"- {c}" for c in local_candidates)
            # str.replace, not str.format: the template holds literal JSON braces.
            b_prompt = CONDITION_B_API_PROMPT_WITH_DRAFT.replace("{local_draft}", local_draft_str) + doc_text
            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges = parse_graph(b["response_text"])
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"ents={b_ents} edges={b_edges} stop={b['stop_reason']} t={b['latency_s']}s",
                      flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_ents = b_edges = None

            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                if a_edges and b_edges is not None and a_edges > 0:
                    edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
                print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)

            record["condition_b"] = {
                "local_latency_s": local_result.get("latency_s"),
                "local_candidates": local_candidates,
                "local_raw": local_raw[:1000],
                "api_input_tokens": b.get("input_tokens"),
                "api_output_tokens": b.get("output_tokens"),
                "api_latency_s": b.get("latency_s"),
                "entity_count": b_ents,
                "edge_count": b_edges,
                "stop_reason": b.get("stop_reason"),
                "response_text": b.get("response_text", "")[:4000],
                "error": b.get("error"),
            }
            results.append(record)
    finally:
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions produced token counts are comparable.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]
    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
        be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
        by_bucket[bucket] = {
            "n": len(rows),
            "a_input_tokens": ai,
            "a_output_tokens": ao,
            "b_input_tokens": bi,
            "b_output_tokens": bo,
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
            "b_avg_edges": round(statistics.mean(be), 1) if be else None,
        }

    summary = {
        "experiment": "cascade_test",
        "title": "Nodes-vs-Edges Cascade Experiment",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary["totals"]["cost_delta_pct"]
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # The deltas are None when a bucket's baseline token sum is zero.
            print(f" {bucket:6s} (n={stats['n']}): "
                  f"in {_fmt_pct(stats['input_delta_pct'])} "
                  f"out {_fmt_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print("NOTE: API cost only. Local Mistral runtime is not monetized.")
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
+248
View File
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
BirdAI Cascaded Extraction — Consistency Test
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime
from dotenv import load_dotenv
# Load the project's .env file so PG_DSN is available to os.getenv below.
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# Connection string passed to psycopg2.connect() in get_sample_documents().
PG_DSN = os.getenv("PG_DSN")
# JSON report written by main().
RESULTS_FILE = os.path.expanduser("~/aaronai/consistency_test_results.json")
# Model name sent to the Ollama generate endpoint.
MODEL = "mistral"
# Number of extraction runs per document that are compared for consistency.
PASSES = 3
# LIMIT applied to the random document sample.
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"
# Prompt prefix; run_extraction() appends the first 1500 characters of the
# document directly after "Text: ".
EXTRACTION_PROMPT = """Extract named entities from this text. Return JSON only, no explanation, no prose.
Use exactly these fields (omit any field you are uncertain about, use empty list if none found):
{
"people": [],
"organizations": [],
"locations": [],
"dates": [],
"document_type": ""
}
Rules:
- Every value in people, organizations, locations, dates must be a plain string
- document_type must be a plain string
- No nested objects, no nested lists
- Only include entities you are certain about
- If uncertain about anything, omit it
Text: """
def get_sample_documents():
    """Fetch a random sample of stored chunks to run the consistency test on.

    Selects up to SAMPLE_SIZE rows from ``embeddings`` whose ``document`` is
    between 101 and 2999 characters long, in random order.

    Returns:
        list of dict-like rows (RealDictCursor) with keys ``id``,
        ``document``, ``source`` and ``created_at``.
    """
    conn = psycopg2.connect(PG_DSN)
    # try/finally rather than `with conn`: psycopg2's connection context
    # manager ends the transaction but does not close the connection.
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute("""
                SELECT id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                AND length(document) < 3000
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()
def run_extraction(text):
    """Run one entity-extraction pass against the local Ollama model.

    Only the first 1500 characters of ``text`` are sent. The reply's
    ``response`` field is searched for the outermost ``{...}`` span, which
    is parsed as JSON.

    Returns:
        ``(parsed, raw)`` on success, where ``parsed`` is the decoded dict
        and ``raw`` the stripped model text. On failure ``(None, message)``
        where ``message`` starts with one of ``NO_JSON``, ``NOT_DICT``,
        ``TIMEOUT``, ``URL_ERROR``, ``JSON_ERROR`` or ``ERROR``.
    """
    # Local import: needed only to recognise pre-3.10 socket timeouts.
    import socket

    prompt = EXTRACTION_PROMPT + text[:1500]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:100]}"
        json_str = raw[start:end]
        parsed = json.loads(json_str)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {json_str[:100]}"
        return parsed, raw
    except (TimeoutError, socket.timeout):
        # Before Python 3.10 socket.timeout is not a TimeoutError subclass,
        # so both must be named for read timeouts to be reported as TIMEOUT.
        return None, "TIMEOUT"
    except urllib.error.URLError as e:
        # urlopen wraps timeouts raised while sending the request in a
        # URLError; unwrap them so they are not reported as URL_ERROR.
        if isinstance(e.reason, (TimeoutError, socket.timeout)):
            return None, "TIMEOUT"
        return None, f"URL_ERROR: {e}"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e}"
    except Exception as e:
        # Best-effort boundary: a failed pass is recorded, never fatal.
        return None, f"ERROR: {type(e).__name__}: {e}"
def flatten_value(v):
    """Reduce an extracted value to a canonical lowercase string.

    Strings are lower-cased and stripped; dicts become their key-sorted JSON
    text, lower-cased; lists become the JSON text of their recursively
    flattened, sorted items; anything else goes through ``str()``.
    """
    if isinstance(v, str):
        return v.lower().strip()
    if isinstance(v, dict):
        return json.dumps(v, sort_keys=True).lower()
    if isinstance(v, list):
        flattened_items = sorted(map(flatten_value, v))
        return json.dumps(flattened_items)
    return str(v).lower().strip()
def normalize_extraction(extracted):
    """Canonicalise one extraction so passes can be compared with ``==``.

    Only the five expected fields are kept. List values become sorted lists
    of flattened strings; any other value is flattened to a single string.
    Missing list fields default to ``[]`` and a missing ``document_type``
    to ``""``. Returns ``None`` when ``extracted`` is ``None``.
    """
    if extracted is None:
        return None
    defaults = {
        "people": [],
        "organizations": [],
        "locations": [],
        "dates": [],
        "document_type": "",
    }
    canonical = {}
    for field, fallback in defaults.items():
        value = extracted.get(field, fallback)
        if isinstance(value, list):
            canonical[field] = sorted(flatten_value(item) for item in value)
        else:
            canonical[field] = flatten_value(value)
    return canonical
def extractions_consistent(extractions):
    """Return True when every pass normalises to the same extraction.

    Any ``None`` entry (a failed pass) makes the result False. An empty
    sequence is vacuously consistent.
    """
    for extraction in extractions:
        if extraction is None:
            return False
    canonical = [normalize_extraction(extraction) for extraction in extractions]
    for entry in canonical:
        if entry is None:
            return False
    if not canonical:
        return True
    reference = canonical[0]
    return all(entry == reference for entry in canonical[1:])
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of ``text``.

    Used as a short content fingerprint in the report, not for security.
    """
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()[:8]
def main():
    """Run the consistency test and write the JSON report.

    Samples documents from the database, runs PASSES extractions on each,
    and classifies every document as TIMEOUT, FAILED, CONSISTENT or
    INCONSISTENT. RESULTS_FILE is rewritten after each document so a partial
    report survives an interrupted run, then once more with the summary.
    """
    print("\nBirdAI Consistency Test")
    print(f"Model: {MODEL} | Passes: {PASSES} | Sample: {SAMPLE_SIZE} docs")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 60)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "passes": PASSES,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None
        },
        "documents": [],
        "summary": {}
    }
    consistent_count = 0
    failed_count = 0
    timeout_count = 0

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # The column is always present in the row but may be NULL; `or`
        # covers both a missing key and a None value before slicing below.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)
        print(f"[{i+1:02d}/{len(docs)}] {source[:50]:<50} hash:{chash}", end=" ", flush=True)

        passes = []
        pass_times = []
        raw_outputs = []
        for _ in range(PASSES):
            t_start = time.time()
            extracted, raw = run_extraction(content)
            t_end = time.time()
            passes.append(extracted)
            pass_times.append(round(t_end - t_start, 1))
            raw_outputs.append(raw[:200] if raw else "")

        consistent = extractions_consistent(passes)
        # Only a failed pass can be a timeout. A successful pass keeps the
        # model's own text in raw_outputs, which may contain "TIMEOUT".
        any_timeout = any(
            parsed is None and message.startswith("TIMEOUT")
            for parsed, message in zip(passes, raw_outputs)
        )
        any_failed = any(parsed is None for parsed in passes)
        if any_timeout:
            timeout_count += 1
            status = "TIMEOUT"
        elif any_failed:
            failed_count += 1
            status = "FAILED"
        elif consistent:
            consistent_count += 1
            status = "CONSISTENT"
        else:
            status = "INCONSISTENT"
        print(f"{status} ({'/'.join(str(t) for t in pass_times)}s)")

        try:
            sample_extraction = normalize_extraction(passes[0]) if passes[0] else None
        except Exception:
            # The sample is informational only; never let it abort the run.
            sample_extraction = None

        results["documents"].append({
            "id": doc_id,
            "source": source,
            "content_hash": chash,
            "content_length": len(content),
            "status": status,
            "consistent": consistent,
            "pass_times_seconds": pass_times,
            "extraction_sample": sample_extraction,
            "raw_samples": raw_outputs
        })
        # Checkpoint after every document.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total = len(docs)
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    # An empty sample would otherwise divide by zero.
    rate = consistent_count / total if total else 0.0
    summary = {
        "total": total,
        "consistent": consistent_count,
        "inconsistent": total - consistent_count - failed_count - timeout_count,
        "failed": failed_count,
        "timeout": timeout_count,
        "consistency_rate": round(rate * 100, 1),
        "cascade_viable": rate >= 0.5
    }
    results["summary"] = summary
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 60)
    print("RESULTS")
    print(f" Consistent: {consistent_count}/{total} ({summary['consistency_rate']}%)")
    print(f" Inconsistent: {summary['inconsistent']}")
    print(f" Failed/Timeout: {failed_count + timeout_count}")
    print(f" Cascade viable: {'YES' if summary['cascade_viable'] else 'NO — reconsider architecture'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 60)


if __name__ == "__main__":
    main()
+230
View File
@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Experiment 003 — Entity-Only Consistency Test
Three Mistral passes per document, measure consistency on entity fields only
(people, organizations, locations, dates). Excludes document_type label.
Samples one aggregated document per distinct source (GROUP BY source) — fixes Exp 001 chunk-replacement flaw.
Outputs: ~/aaronai/experiments/consistency_test_v2_results.json
"""
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import psycopg2
import requests
from dotenv import load_dotenv
# main() reads PG_DSN from the environment; load the project's .env first.
load_dotenv(Path.home() / "aaronai" / ".env")
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "consistency_test_v2_results.json"
# Local Ollama endpoint and model name used by call_mistral().
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL = "mistral"
N_PASSES = 3
# Passed as the LIMIT to fetch_distinct_sources(), which then drops
# near-empty documents, so fewer than N_DOCS may come back.
N_DOCS = 50
PER_CALL_TIMEOUT = 60 # seconds — fail fast, don't wedge
MAX_DOC_CHARS = 8000 # cap document length sent to Mistral
# Prompt prefix; call_mistral() appends the (possibly truncated) document
# text directly after "DOCUMENT:". The four keys match those read by
# parse_entities().
EXTRACTION_PROMPT = """Extract entities from the document below. Return ONLY valid JSON with this exact schema:
{
"people": [string],
"organizations": [string],
"locations": [string],
"dates": [string]
}
Rules:
- Only include entities you are CERTAIN about. If uncertain, omit.
- No prose, no markdown fences, no commentary. JSON only.
- Empty arrays are valid.
DOCUMENT:
"""
def call_mistral(document_text):
    """Run one entity-extraction call against the Ollama generate endpoint.

    The document is cut to MAX_DOC_CHARS and appended to EXTRACTION_PROMPT.

    Returns a dict. On success it holds "response" (the model text, "" when
    the reply JSON has no "response" key), "latency_s" and "truncated". On
    failure it holds "error" and "latency_s"; a timeout reports the configured
    PER_CALL_TIMEOUT as its latency rather than a measured value.
    """
    clipped = document_text[:MAX_DOC_CHARS]
    started = time.time()
    try:
        request_body = {
            "model": MODEL,
            "prompt": EXTRACTION_PROMPT + clipped,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 512},
        }
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=PER_CALL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        latency = round(time.time() - started, 2)
    except requests.exceptions.Timeout:
        return {"error": f"timeout after {PER_CALL_TIMEOUT}s", "latency_s": PER_CALL_TIMEOUT}
    except Exception as exc:
        # Best-effort boundary: any other failure is reported, not raised.
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
    return {
        "response": generated,
        "latency_s": latency,
        "truncated": len(document_text) > MAX_DOC_CHARS,
    }
def parse_entities(raw_response):
    """Parse a model reply into a normalized entity dict.

    Leading/trailing markdown code fences are removed before JSON decoding.

    Returns a dict with the keys "people", "organizations", "locations" and
    "dates", each a sorted list of unique, stripped, lower-cased strings.
    Returns None when the reply is not valid JSON, is not a JSON object, or
    any of the four fields is present but not a list. A missing field is
    treated as an empty list.
    """
    text = (raw_response or "").strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    # Valid JSON is not necessarily an object ("[]", "42", '"x"'); without
    # this guard data.get would raise AttributeError.
    if not isinstance(data, dict):
        return None
    out = {}
    for key in ("people", "organizations", "locations", "dates"):
        vals = data.get(key, [])
        if not isinstance(vals, list):
            return None
        normalized = {str(v).strip().lower() for v in vals if v}
        # Whitespace-only values are truthy but normalize to "", which is
        # not an entity and would create spurious mismatches between passes.
        normalized.discard("")
        out[key] = sorted(normalized)
    return out
def entities_match(a, b):
    """Return True when two parsed entity dicts agree on all four fields.

    Either argument being None (an unparseable pass) counts as a mismatch.
    """
    if a is None or b is None:
        return False
    for field in ("people", "organizations", "locations", "dates"):
        if a[field] != b[field]:
            return False
    return True
def fetch_distinct_sources(pg_conn, n):
    """Fetch up to *n* sources with their chunks joined into one document.

    Chunks of each source are concatenated in id order, and sources are
    ordered by their lowest chunk id. Sources whose joined text is empty or
    has 50 or fewer non-whitespace-trimmed characters are dropped AFTER the
    LIMIT is applied, so fewer than *n* pairs may be returned.

    Returns a list of (source, document) tuples. The cursor is closed even
    when the query fails; the connection is left open for the caller.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute("""
SELECT source, string_agg(document, E'\n\n' ORDER BY id) AS doc
FROM embeddings
WHERE source IS NOT NULL
GROUP BY source
ORDER BY MIN(id)
LIMIT %s
""", (n,))
        rows = cur.fetchall()
    finally:
        cur.close()
    return [(s, d) for s, d in rows if d and len(d.strip()) > 50]
def main():
    """Run Experiment 003 and write its summary JSON to OUTPUT_FILE.

    For each fetched source, call_mistral is invoked N_PASSES times. A
    document is "consistent" when every pass parsed and all passes produced
    identical entity lists for all four fields. The comparison works for any
    N_PASSES rather than assuming exactly three passes.

    Exits with status 1 when PG_DSN is not set.
    """
    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)
    pg_conn = psycopg2.connect(pg_dsn)
    try:
        docs = fetch_distinct_sources(pg_conn, N_DOCS)
    finally:
        # Release the connection even if the sampling query fails.
        pg_conn.close()
    print(f"Loaded {len(docs)} distinct sources from pgvector")
    print(f"Model: {MODEL} | Passes per doc: {N_PASSES}")
    print(f"Per-call timeout: {PER_CALL_TIMEOUT}s | Max doc chars: {MAX_DOC_CHARS}")
    print(f"Calls planned: {len(docs) * N_PASSES}\n")
    fields = ("people", "organizations", "locations", "dates")
    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()
    for i, (source, doc_text) in enumerate(docs, 1):
        size_marker = f"[{len(doc_text):>5}c]"
        print(f"[{i:02d}/{len(docs)}] {size_marker} {source[:55]}", flush=True)
        passes = []
        for pass_no in range(1, N_PASSES + 1):
            r = call_mistral(doc_text)
            if "error" in r:
                print(f" pass {pass_no}: {r['error']}", flush=True)
                passes.append({"error": r["error"], "parsed_ok": False, "latency_s": r["latency_s"]})
            else:
                entities = parse_entities(r["response"])
                passes.append({
                    "raw": r["response"][:500],
                    "entities": entities,
                    "latency_s": r["latency_s"],
                    "parsed_ok": entities is not None,
                    "truncated_input": r.get("truncated", False),
                })
        # bool(passes) keeps N_PASSES == 0 from counting as vacuously parsed.
        all_parsed = bool(passes) and all(rec.get("parsed_ok") for rec in passes)
        if all_parsed:
            extracted = [rec["entities"] for rec in passes]
            first, rest = extracted[0], extracted[1:]
            # Equality is transitive, so comparing every pass to the first
            # is equivalent to comparing all pairs.
            consistent = all(entities_match(first, other) for other in rest)
            per_field = {
                k: all(other[k] == first[k] for other in rest)
                for k in fields
            }
        else:
            consistent = False
            per_field = None
        latencies = [rec.get("latency_s", 0) for rec in passes]
        print(f" parsed={all_parsed} consistent={consistent} latencies={latencies}", flush=True)
        results.append({
            "source": source,
            "doc_chars": len(doc_text),
            "passes": passes,
            "all_parsed": all_parsed,
            "consistent": consistent,
            "per_field_consistency": per_field,
        })
    total_elapsed = round(time.time() - t_total, 1)
    parsed = [r for r in results if r["all_parsed"]]
    consistent = [r for r in parsed if r["consistent"]]
    field_rates = {k: 0 for k in fields}
    for r in parsed:
        for k, v in (r["per_field_consistency"] or {}).items():
            if v:
                field_rates[k] += 1
    field_rates_pct = {
        k: round(100 * v / len(parsed), 1) if parsed else 0.0
        for k, v in field_rates.items()
    }
    summary = {
        "experiment": "003",
        "title": "Entity-Only Consistency Test",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "n_passes": N_PASSES,
        "per_call_timeout_s": PER_CALL_TIMEOUT,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(docs),
        "n_all_parsed": len(parsed),
        "n_fully_consistent": len(consistent),
        "consistency_rate_pct": round(100 * len(consistent) / len(docs), 2) if docs else 0.0,
        "consistency_rate_among_parsed_pct": (
            round(100 * len(consistent) / len(parsed), 2) if parsed else 0.0
        ),
        "per_field_consistency_pct": field_rates_pct,
        "total_elapsed_s": total_elapsed,
        "exp_001_baseline_pct": 18.0,
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)
    print()
    print("=" * 60)
    print(f"DONE — {len(docs)} docs in {total_elapsed}s")
    print(f"All {N_PASSES} passes parsed cleanly: {len(parsed)}/{len(docs)}")
    print(f"Fully consistent (all 4 fields match): {len(consistent)}/{len(docs)} ({summary['consistency_rate_pct']}%)")
    print(f"Among parsed only: {summary['consistency_rate_among_parsed_pct']}%")
    print(f"Per-field consistency: {field_rates_pct}")
    print(f"Exp 001 baseline: 18% | delta: {summary['consistency_rate_pct'] - 18.0:+.2f} pts")
    print(f"Results: {OUTPUT_FILE}")
if __name__ == "__main__":
main()
@@ -0,0 +1,179 @@
"""
Measure actual Graphiti BULK episode cost on a stratified sample.
Uses /episodes/bulk endpoint. Submits in small batches to avoid rate limits.
"""
import json, os, random, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
SAMPLE_SIZE = 50
BATCH_SIZE = 5
RANDOM_SEED = 42
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
OUT.parent.mkdir(parents=True, exist_ok=True)
def fetch_stratified_sample():
    """Draw a seeded, length-stratified sample of (source, document) pairs.

    Every source's chunks are joined in id order. Sources are bucketed by
    joined length (<1000, 1000-4999, >=5000 characters) and up to 15 / 25 /
    10 are drawn from the respective buckets after seeding the global
    ``random`` generator with RANDOM_SEED.
    """
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("""
SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
FROM embeddings
GROUP BY source
""")
    sources = []
    for name, text in cur.fetchall():
        if text:
            sources.append((name, text))
    cur.close()
    conn.close()
    random.seed(RANDOM_SEED)
    short, medium, long_ = [], [], []
    for pair in sources:
        n_chars = len(pair[1])
        if n_chars < 1000:
            short.append(pair)
        elif n_chars < 5000:
            medium.append(pair)
        else:
            long_.append(pair)
    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")
    sample = []
    # Draw order (short, medium, long) matters for seeded reproducibility.
    for pool, quota in ((short, 15), (medium, 25), (long_, 10)):
        sample.extend(random.sample(pool, min(quota, len(pool))))
    print(f"Sample: {len(sample)} sources, batch_size={BATCH_SIZE}")
    return sample
def submit_bulk_batch(batch):
    """POST one batch of (source, document) pairs to the bulk episodes endpoint.

    Each document is cut to 12000 characters. Returns a result dict with
    "batch_size", "status_code", "elapsed_s", "elapsed_per_episode_s",
    "response", "error" and "sources". "error" is None exactly when the
    server answered with a success status; a transport failure yields
    "status_code" None and the exception text as "error".

    A success response whose body is not valid JSON is still reported as a
    success (with "response" None). Previously that case was recorded as a
    failed batch, which made follow-up retry runs resubmit accepted work.
    """
    payload = {
        "episodes": [
            {
                "name": source,
                "content": doc[:12000],
                "source_description": "pgvector_migration_bulk_test",
                "timestamp": "2026-04-28T00:00:00",
            }
            for source, doc in batch
        ]
    }
    sources = [s for s, _ in batch]
    t0 = time.time()
    try:
        r = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
    except Exception as e:
        # Best-effort boundary: record the failure and let the run continue.
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - t0, 2),
            "elapsed_per_episode_s": None,
            "response": None,
            "error": str(e)[:500],
            "sources": sources,
        }
    elapsed = time.time() - t0
    body = None
    if r.ok:
        try:
            body = r.json()
        except ValueError:
            # requests raises a ValueError subclass for undecodable bodies.
            body = None
    return {
        "batch_size": len(batch),
        "status_code": r.status_code,
        "elapsed_s": round(elapsed, 2),
        "elapsed_per_episode_s": round(elapsed / len(batch), 2) if batch else None,
        "response": body,
        "error": None if r.ok else r.text[:500],
        "sources": sources,
    }
def main():
    """Run the bulk-ingest cost test and write its summary JSON to OUT.

    Prompts the operator to note current API spend, submits the stratified
    sample in batches of BATCH_SIZE, then extrapolates the mean per-episode
    wall time to the whole corpus (distinct sources in ``embeddings``).

    A batch is treated as rate limited when the HTTP status is 429 or the
    error text mentions "429"/"rate"; the 30s pause is skipped after the
    final batch because nothing follows it.
    """
    print("=" * 60)
    print("Graphiti BULK Migration Cost Test (Haiku 4.5)")
    print("=" * 60)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()
    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return
    batches = [sample[i:i+BATCH_SIZE] for i in range(0, len(sample), BATCH_SIZE)]
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE} episodes")
    print()
    results = []
    total_start = time.time()
    for i, batch in enumerate(batches, start=1):
        avg_chars = int(sum(len(d) for _, d in batch) / len(batch))
        print(f"[batch {i:2d}/{len(batches)}] n={len(batch)} avg_chars={avg_chars:6d}",
              end=" ", flush=True)
        result = submit_bulk_batch(batch)
        results.append(result)
        if result["error"]:
            err_text = result["error"] or ""
            print(f" ERROR: {err_text[:80]}")
            # The status code is the reliable signal; the body of a 429
            # need not contain the digits "429" or the word "rate".
            rate_limited = (
                result["status_code"] == 429
                or "429" in err_text
                or "rate" in err_text.lower()
            )
            if rate_limited and i < len(batches):
                print(" Rate limited - pausing 30s before next batch")
                time.sleep(30)
        else:
            print(f" {result['status_code']} {result['elapsed_s']}s "
                  f"({result['elapsed_per_episode_s']}s/episode)")
    total_elapsed = time.time() - total_start
    successful_batches = [r for r in results if r["error"] is None]
    failed_batches = [r for r in results if r["error"] is not None]
    successful_episodes = sum(r["batch_size"] for r in successful_batches)
    failed_episodes = sum(r["batch_size"] for r in failed_batches)
    summary = {
        "sample_size": len(sample),
        "batch_size": BATCH_SIZE,
        "n_batches": len(batches),
        "successful_batches": len(successful_batches),
        "failed_batches": len(failed_batches),
        "successful_episodes": successful_episodes,
        "failed_episodes": failed_episodes,
        "total_elapsed_s": round(total_elapsed, 1),
        "mean_elapsed_per_episode_s": round(
            sum(r["elapsed_s"] for r in successful_batches) / successful_episodes, 2
        ) if successful_episodes else None,
        "results": results,
    }
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
        total_sources = cur.fetchone()[0]
        cur.close()
    finally:
        conn.close()
    summary["total_corpus_sources"] = total_sources
    if summary["mean_elapsed_per_episode_s"]:
        summary["estimated_migration_hours"] = round(
            total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
        )
    OUT.write_text(json.dumps(summary, indent=2))
    print()
    print("=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Episodes: {summary['successful_episodes']}/{summary['sample_size']} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    if summary["mean_elapsed_per_episode_s"]:
        print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
        print(f"Total corpus sources: {summary['total_corpus_sources']}")
        print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print(f"AFTER:")
    # The baseline is whatever the operator noted at the start of this run,
    # not a fixed dollar figure from an earlier session.
    print(f" Wait 5 min; note new Anthropic spend; subtract the spend noted before the run.")
    print(f" delta / {summary['successful_episodes']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
@@ -0,0 +1,122 @@
"""
Retest just the previously-failed batches after raising MAX_QUEUED_QUERIES.
Reads failed sources from graphiti_bulk_cost_test.json and resubmits.
"""
import json, os, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
BATCH_SIZE = 5
PRIOR_RESULTS = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
def fetch_doc_for_source(cur, source):
    """Return all chunks of *source* joined in id order, or None.

    *cur* is an open database cursor. None is returned when no row comes
    back; the aggregate itself may also be None.
    """
    query = """
SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id)
FROM embeddings WHERE source = %s
"""
    cur.execute(query, (source,))
    first_row = cur.fetchone()
    if not first_row:
        return None
    return first_row[0]
def submit_bulk_batch(batch):
    """POST one retry batch of (source, document) pairs to the bulk endpoint.

    Each document is cut to 12000 characters. Returns a result dict with
    "batch_size", "status_code", "elapsed_s", "elapsed_per_episode_s",
    "error" and "sources". "error" is None exactly when the server answered
    with a success status; any exception is recorded as "error" with
    "status_code" None.

    The elapsed time is read once so that "elapsed_per_episode_s" is derived
    from the same measurement as "elapsed_s".
    """
    payload = {"episodes": [
        {"name": s, "content": d[:12000],
         "source_description": "pgvector_migration_bulk_retry",
         "timestamp": "2026-04-28T00:00:00"}
        for s, d in batch
    ]}
    t0 = time.time()
    try:
        r = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
        elapsed = time.time() - t0
        return {
            "batch_size": len(batch),
            "status_code": r.status_code,
            "elapsed_s": round(elapsed, 2),
            "elapsed_per_episode_s": round(elapsed / len(batch), 2),
            "error": None if r.ok else r.text[:500],
            "sources": [s for s, _ in batch],
        }
    except Exception as e:
        # Best-effort boundary: record the failure and let the run continue.
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - t0, 2),
            "elapsed_per_episode_s": None,
            "error": str(e)[:500],
            "sources": [s for s, _ in batch],
        }
def main():
    """Resubmit the sources of every failed batch recorded in PRIOR_RESULTS.

    Source text is re-read from the ``embeddings`` table, sources without
    text are reported and skipped, and the retry summary is written to OUT.
    """
    prior = json.loads(PRIOR_RESULTS.read_text())
    failed_sources = [
        src
        for batch_result in prior["results"]
        if batch_result["error"] is not None
        for src in batch_result["sources"]
    ]
    print(f"Retrying {len(failed_sources)} previously-failed sources")
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources_with_docs = []
    for s in failed_sources:
        doc = fetch_doc_for_source(cur, s)
        if not doc:
            print(f" WARN: could not find doc for source {s}")
            continue
        sources_with_docs.append((s, doc))
    cur.close()
    conn.close()
    print(f"Loaded {len(sources_with_docs)} source docs")
    print()
    batches = []
    for offset in range(0, len(sources_with_docs), BATCH_SIZE):
        batches.append(sources_with_docs[offset:offset+BATCH_SIZE])
    results = []
    total_start = time.time()
    for i, batch in enumerate(batches, start=1):
        avg = int(sum(len(d) for _, d in batch) / len(batch))
        print(f"[batch {i:2d}/{len(batches)}] n={len(batch)} avg_chars={avg:6d}",
              end=" ", flush=True)
        result = submit_bulk_batch(batch)
        results.append(result)
        if result["error"]:
            print(f" ERROR: {result['error'][:80]}")
        else:
            print(f" {result['status_code']} {result['elapsed_s']}s")
    total_elapsed = time.time() - total_start
    successful, failed = [], []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)
    summary = {
        "n_retry_sources": len(sources_with_docs),
        "n_batches": len(batches),
        "successful_batches": len(successful),
        "failed_batches": len(failed),
        "successful_episodes": sum(r["batch_size"] for r in successful),
        "failed_episodes": sum(r["batch_size"] for r in failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))
    divider = "=" * 60
    print()
    print(divider)
    print("RETRY RESULTS")
    print(divider)
    print(f"Episodes: {summary['successful_episodes']}/{len(sources_with_docs)} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print()
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
@@ -0,0 +1,93 @@
"""Retry attempt #2 — for sources that timed out after MAX_QUEUED_QUERIES bump."""
import json, os, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
BATCH_SIZE = 3 # smaller batches given timeouts
PRIOR = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry2.json"
def fetch_doc(cur, source):
    """Return all chunks of *source* joined in id order, or None if no row."""
    query = "SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id) FROM embeddings WHERE source = %s"
    cur.execute(query, (source,))
    first_row = cur.fetchone()
    if not first_row:
        return None
    return first_row[0]
def submit_batch(batch):
    """POST one batch of (source, document) pairs to the bulk episodes endpoint.

    Documents are cut to 12000 characters. Returns a dict with "batch_size",
    "status_code", "elapsed_s", "error" and "sources"; "error" is None when
    the server answered with a success status, and any exception is captured
    as "error" with "status_code" None.
    """
    episodes = []
    names = []
    for s, d in batch:
        names.append(s)
        episodes.append({
            "name": s,
            "content": d[:12000],
            "source_description": "pgvector_migration_bulk_retry2",
            "timestamp": "2026-04-28T00:00:00",
        })
    payload = {"episodes": episodes}
    began = time.time()
    try:
        reply = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
        outcome = {
            "batch_size": len(batch),
            "status_code": reply.status_code,
            "elapsed_s": round(time.time() - began, 2),
            "error": None if reply.ok else reply.text[:500],
            "sources": names,
        }
    except Exception as exc:
        outcome = {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - began, 2),
            "error": str(exc)[:500],
            "sources": names,
        }
    return outcome
def main():
    """Second retry: resubmit sources of batches that failed in PRIOR.

    Source text is re-read from ``embeddings``; sources with no text are
    silently dropped. The summary is written to OUT.
    """
    prior = json.loads(PRIOR.read_text())
    failed = [
        name
        for entry in prior["results"]
        if entry["error"] is not None
        for name in entry["sources"]
    ]
    print(f"Retry #2: {len(failed)} sources still failing")
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources = []
    for s in failed:
        d = fetch_doc(cur, s)
        if d:
            sources.append((s, d))
    cur.close()
    conn.close()
    batches = []
    for offset in range(0, len(sources), BATCH_SIZE):
        batches.append(sources[offset:offset+BATCH_SIZE])
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE}\n")
    results = []
    for i, batch in enumerate(batches, 1):
        avg = int(sum(len(d) for _, d in batch) / len(batch))
        print(f"[batch {i}/{len(batches)}] n={len(batch)} avg_chars={avg:6d}", end=" ", flush=True)
        outcome = submit_batch(batch)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    succ, fail = [], []
    for outcome in results:
        (succ if outcome["error"] is None else fail).append(outcome)
    summary = {
        "n_sources": len(sources),
        "successful_batches": len(succ),
        "failed_batches": len(fail),
        "successful_episodes": sum(r["batch_size"] for r in succ),
        "failed_episodes": sum(r["batch_size"] for r in fail),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))
    print()
    print(f"Episodes: {summary['successful_episodes']}/{len(sources)} succeeded")
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
@@ -0,0 +1,175 @@
"""
Measure actual Graphiti episode-add cost on a stratified sample of pgvector sources.
"""
import json, os, random, time
from pathlib import Path
import psycopg2, requests
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.environ["PG_DSN"]
SAMPLE_SIZE = 50
RANDOM_SEED = 42
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_cost_test.json"
OUT.parent.mkdir(parents=True, exist_ok=True)
def fetch_stratified_sample():
    """Draw a seeded, length-stratified sample of (source, document) pairs.

    Every source's chunks are joined in id order. Sources are bucketed by
    joined length (<1000, 1000-4999, >=5000 characters) and up to 15 / 25 /
    10 are drawn from the respective buckets after seeding the global
    ``random`` generator with RANDOM_SEED.
    """
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("""
SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
FROM embeddings
GROUP BY source
""")
    sources = []
    for name, text in cur.fetchall():
        if text:
            sources.append((name, text))
    cur.close()
    conn.close()
    random.seed(RANDOM_SEED)
    short, medium, long_ = [], [], []
    for pair in sources:
        n_chars = len(pair[1])
        if n_chars < 1000:
            short.append(pair)
        elif n_chars < 5000:
            medium.append(pair)
        else:
            long_.append(pair)
    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")
    sample = []
    # Draw order (short, medium, long) matters for seeded reproducibility.
    for pool, quota in ((short, 15), (medium, 25), (long_, 10)):
        sample.extend(random.sample(pool, min(quota, len(pool))))
    print(f"Sample: {len(sample)} sources")
    return sample
def submit_episode(source: str, document: str) -> dict:
    """POST a single episode to the /episodes endpoint and time it.

    The document is cut to 12000 characters. The returned dict always holds
    "source", "doc_chars", "doc_chars_sent", "status_code", "elapsed_s" and
    "error"; "error" is None when the server answered with a success status,
    and any exception is captured as "error" with "status_code" None.
    """
    payload = {
        "name": source,
        "content": document[:12000],
        "source_description": "pgvector_migration_cost_test",
        "timestamp": "2026-04-28T00:00:00",
    }
    began = time.time()
    try:
        reply = requests.post(f"{GRAPHITI_URL}/episodes", json=payload, timeout=600)
        status = reply.status_code
        elapsed = round(time.time() - began, 2)
        problem = None if reply.ok else reply.text[:500]
    except Exception as exc:
        status = None
        elapsed = round(time.time() - began, 2)
        problem = str(exc)[:500]
    return {
        "source": source,
        "doc_chars": len(document),
        "doc_chars_sent": min(len(document), 12000),
        "status_code": status,
        "elapsed_s": elapsed,
        "error": problem,
    }
def main():
    """Run the single-episode cost test and write its summary JSON to OUT.

    The first sampled source is submitted as a smoke test; a failure there
    writes only the smoke result and halts. Otherwise the remaining sources
    are submitted one at a time, per-bucket timings are aggregated, and the
    mean per-episode wall time is extrapolated to the number of distinct
    sources in ``embeddings``.
    """
    rule = "=" * 60
    print(rule)
    print("Graphiti Migration Cost Test (Haiku 4.5)")
    print(rule)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()
    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return
    # Smoke test
    first_source, first_doc = sample[0]
    print(f"Smoke test on first source ({first_source[:50]}...):")
    smoke = submit_episode(first_source, first_doc)
    print(f" status={smoke['status_code']} elapsed={smoke['elapsed_s']}s")
    if smoke["error"]:
        print(f" ERROR: {smoke['error']}")
        OUT.write_text(json.dumps({"smoke_test": smoke}, indent=2))
        print("Halted — fix smoke test before bulk run.")
        return
    print(f" OK. Proceeding with {len(sample)} sources.")
    print()
    results = [smoke]
    # The timer starts after the smoke test, so total_elapsed_s excludes it.
    total_start = time.time()
    for i, (source, doc) in enumerate(sample[1:], start=2):
        if len(doc) < 1000:
            bucket = "short"
        elif len(doc) < 5000:
            bucket = "medium"
        else:
            bucket = "long"
        print(f"[{i:2d}/{len(sample)}] [{bucket:6s}] [{len(doc):6d}c] {source[:50]:50s}", end=" ", flush=True)
        outcome = submit_episode(source, doc)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start
    successful, failed = [], []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)
    mean_elapsed = sum(r["elapsed_s"] for r in successful) / max(len(successful), 1)
    summary = {
        "sample_size": len(sample),
        "successful": len(successful),
        "failed": len(failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "mean_elapsed_per_episode_s": round(mean_elapsed, 2),
        "by_bucket": {},
        "results": results,
    }
    bounds = (("short", 0, 1000), ("medium", 1000, 5000), ("long", 5000, 10**9))
    for bname, lo, hi in bounds:
        members = [r for r in successful if lo <= r["doc_chars"] < hi]
        if not members:
            continue
        count = len(members)
        summary["by_bucket"][bname] = {
            "n": count,
            "mean_elapsed_s": round(sum(r["elapsed_s"] for r in members) / count, 2),
            "mean_chars": int(sum(r["doc_chars"] for r in members) / count),
        }
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
    total_sources = cur.fetchone()[0]
    cur.close()
    conn.close()
    summary["total_corpus_sources"] = total_sources
    summary["estimated_migration_hours"] = round(
        total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
    )
    OUT.write_text(json.dumps(summary, indent=2))
    print()
    print(rule)
    print("RESULTS")
    print(rule)
    print(f"Sample: {summary['successful']}/{summary['sample_size']} succeeded, {summary['failed']} failed")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
    for bucket, stats in summary["by_bucket"].items():
        print(f" {bucket:6s} n={stats['n']:3d} chars~{stats['mean_chars']:6d} elapsed~{stats['mean_elapsed_s']}s")
    print()
    print(f"Total corpus sources: {summary['total_corpus_sources']}")
    print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print("AFTER:")
    print(" Wait 5 min; note new Anthropic spend; subtract.")
    print(f" test_cost / {summary['successful']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")
if __name__ == "__main__":
main()
@@ -0,0 +1,155 @@
"""
E1.4 per-source predicate diversity comparison — fixed version.
Looks up episode uuids by name in both production and cascade graphs.
"""
import json
from collections import defaultdict
from falkordb import FalkorDB
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
PRODUCTION_GROUP = "aaron"
CASCADE_GROUP = "aaron_cascade_e14"
def get_predicates_for_episode(graph, episode_uuid):
    """Count distinct RELATES_TO edge names that list *episode_uuid*.

    Returns the first column of the first result row, or 0 when the query
    yields no rows.
    """
    cypher = """
MATCH ()-[r:RELATES_TO]->()
WHERE $uuid IN r.episodes
RETURN count(DISTINCT r.name) AS predicate_count
"""
    found = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not found:
        return 0
    return found[0][0]
def get_edge_count_for_episode(graph, episode_uuid):
    """Count RELATES_TO edges whose ``episodes`` list holds *episode_uuid*.

    Returns the first column of the first result row, or 0 when the query
    yields no rows.
    """
    cypher = """
MATCH ()-[r:RELATES_TO]->()
WHERE $uuid IN r.episodes
RETURN count(r) AS edge_count
"""
    found = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not found:
        return 0
    return found[0][0]
def find_episode_uuid(graph, source_name):
    """Look up the uuid of one Episodic node named *source_name*.

    Returns None when no such node is found. The query is limited to a
    single row, so only one match is ever returned.
    """
    cypher = """
MATCH (e:Episodic {name: $name})
RETURN e.uuid AS uuid
LIMIT 1
"""
    found = graph.query(cypher, {"name": source_name}).result_set
    if not found:
        return None
    return found[0][0]
def _pct_change(before, after):
    """Return the percentage change from *before* to *after* (0.0 if before is 0)."""
    if not before:
        return 0.0
    return (after - before) / before * 100


def main():
    """Compare per-source predicate and edge counts between the two graphs.

    Sources are the E1.4 result records that carry a "submit_result". Each
    is looked up by episode name in the production and cascade graphs;
    sources missing from either are reported and skipped. Prints a
    per-source table, per-bucket totals and an aggregate, then writes the
    comparison rows to a JSON file.
    """
    db = FalkorDB(host='localhost', port=6379)
    prod_graph = db.select_graph(PRODUCTION_GROUP)
    cascade_graph = db.select_graph(CASCADE_GROUP)
    with open(E14_RESULTS) as f:
        e14 = json.load(f)
    sources = [r for r in e14['results'] if 'submit_result' in r]
    print(f"Analyzing {len(sources)} sources...")
    print()
    comparisons = []
    missing_prod = 0
    missing_cascade = 0
    for src in sources:
        name = src['name']
        bucket = src['bucket']
        prod_uuid = find_episode_uuid(prod_graph, name)
        cascade_uuid = find_episode_uuid(cascade_graph, name)
        if not prod_uuid:
            missing_prod += 1
            print(f" WARN: missing in production: {name}")
            continue
        if not cascade_uuid:
            missing_cascade += 1
            print(f" WARN: missing in cascade: {name}")
            continue
        prod_preds = get_predicates_for_episode(prod_graph, prod_uuid)
        cascade_preds = get_predicates_for_episode(cascade_graph, cascade_uuid)
        prod_edges = get_edge_count_for_episode(prod_graph, prod_uuid)
        cascade_edges = get_edge_count_for_episode(cascade_graph, cascade_uuid)
        comparisons.append({
            "name": name,
            "bucket": bucket,
            "prod_preds": prod_preds,
            "cascade_preds": cascade_preds,
            "delta_preds": cascade_preds - prod_preds,
            "prod_edges": prod_edges,
            "cascade_edges": cascade_edges,
            "delta_edges": cascade_edges - prod_edges,
        })
    if missing_prod or missing_cascade:
        print()
        print(f"Missing: {missing_prod} prod, {missing_cascade} cascade")
        print()
    if not comparisons:
        print("No comparable sources found. Aborting.")
        return
    # Per-source detail
    print(f"{'Bucket':<10} {'Source':<58} {'Preds A→B':<14} {'Δ':<6} {'Edges A→B':<14} {'Δ'}")
    print("-" * 115)
    for c in sorted(comparisons, key=lambda x: (x['bucket'], x['name'])):
        name_short = (c['name'][:55] + '..') if len(c['name']) > 58 else c['name']
        # The arrow keeps the two counts apart; without a separator 12 and
        # 15 would print as the single number "1215".
        preds_str = f"{c['prod_preds']}→{c['cascade_preds']}"
        edges_str = f"{c['prod_edges']}→{c['cascade_edges']}"
        # Pad the first delta to the width of its header column.
        print(f"{c['bucket']:<10} {name_short:<58} {preds_str:<14} {c['delta_preds']:<+6d} {edges_str:<14} {c['delta_edges']:+d}")
    # Per-bucket aggregation
    print()
    print("=" * 115)
    print("PER-BUCKET AGGREGATION")
    print("=" * 115)
    by_bucket = defaultdict(list)
    for c in comparisons:
        by_bucket[c['bucket']].append(c)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        sum_pp = sum(c['prod_preds'] for c in items)
        sum_cp = sum(c['cascade_preds'] for c in items)
        sum_pe = sum(c['prod_edges'] for c in items)
        sum_ce = sum(c['cascade_edges'] for c in items)
        positive = sum(1 for c in items if c['delta_preds'] > 0)
        negative = sum(1 for c in items if c['delta_preds'] < 0)
        flat = sum(1 for c in items if c['delta_preds'] == 0)
        pct_pred = _pct_change(sum_pp, sum_cp)
        pct_edge = _pct_change(sum_pe, sum_ce)
        print(f"\n{bucket.upper()} (n={n}):")
        print(f" Predicates: {sum_pp}→{sum_cp} ({pct_pred:+.1f}%)")
        print(f" Edges: {sum_pe}→{sum_ce} ({pct_edge:+.1f}%)")
        print(f" Outcomes: {positive} positive, {flat} flat, {negative} negative")
    # Aggregate
    print()
    print("=" * 115)
    print(f"AGGREGATE (n={len(comparisons)})")
    print("=" * 115)
    total_pp = sum(c['prod_preds'] for c in comparisons)
    total_cp = sum(c['cascade_preds'] for c in comparisons)
    total_pe = sum(c['prod_edges'] for c in comparisons)
    total_ce = sum(c['cascade_edges'] for c in comparisons)
    # _pct_change guards the zero-baseline case that used to raise
    # ZeroDivisionError here (the per-bucket section was already guarded).
    print(f" Predicates: {total_pp}→{total_cp} ({_pct_change(total_pp, total_cp):+.1f}%)")
    print(f" Edges: {total_pe}→{total_ce} ({_pct_change(total_pe, total_ce):+.1f}%)")
    out_path = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
    with open(out_path, "w") as f:
        json.dump(comparisons, f, indent=2)
    print()
    print(f"Saved to {out_path}")
if __name__ == "__main__":
main()
+208
View File
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""E1.4 orchestration — cascade re-extraction at n=30, group_id=aaron_cascade_e14."""
import json
import os
import requests
import time
from pathlib import Path
import psycopg2
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
SAMPLE_FILE = EXPERIMENTS / "e14_sample.json"
RESULTS_FILE = EXPERIMENTS / "e14_cascade_results.json"
PG_DSN = os.environ["PG_DSN"]
SIDECAR_URL = "http://localhost:8001"
TEST_GROUP_ID = "aaron_cascade_e14"
MAX_DOC_CHARS = 12000
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
def get_pg():
    """Open and return a new psycopg2 connection using PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
def fetch_source_text(source):
    """Return all chunks of *source* joined in id order, or None if there are none.

    A fresh connection is opened per call. Both the cursor and the
    connection are closed even when the query raises, so a failing lookup
    does not leak a connection.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
FROM embeddings WHERE source = %s
""", (source,))
            row = cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()
    if row is None or row[0] is None:
        return None
    return row[0]
def run_mistral_metadata(text, max_retries=2):
    """Ask the local Mistral model for the metadata JSON of a document.

    The document is cut to MAX_DOC_CHARS and appended to METADATA_PROMPT.
    Read timeouts and connection errors are retried (5s apart) up to
    *max_retries* attempts in total.

    Returns the decoded metadata dict with "char_length" overwritten by the
    length of the text actually sent. On any failure returns a dict with an
    "error" key instead of raising, so one bad document cannot abort the
    run: this covers HTTP error statuses, a reply without a "response"
    field, and model output that is not a JSON object.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    last_err = None
    for attempt in range(max_retries):
        try:
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
                timeout=300,
            )
            response.raise_for_status()
            raw = response.json()["response"]
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError) as e:
            # Transient transport problems: retry after a short pause.
            last_err = e
            if attempt < max_retries - 1:
                print(f" (retry {attempt+1} after {type(e).__name__})", end=" ", flush=True)
                time.sleep(5)
            continue
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as e:
            # HTTP error status, undecodable body, or missing "response"
            # field: not worth retrying, but must not crash the caller.
            return {"error": f"Ollama request failed: {e}"}
        try:
            metadata = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return {"error": "JSON parse failed", "raw": str(raw)[:500]}
        if not isinstance(metadata, dict):
            # Valid JSON that is not an object cannot carry the schema.
            return {"error": "JSON parse failed", "raw": str(raw)[:500]}
        metadata["char_length"] = len(truncated)
        return metadata
    return {"error": f"After {max_retries} retries: {last_err}"}
def format_metadata_as_orientation(metadata):
    """Render metadata as a short orientation paragraph for extraction.

    Returns None when *metadata* carries an "error" key. Missing domain or
    format fall back to "unknown"; a missing summary becomes an empty string.
    """
    if "error" in metadata:
        return None
    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    pieces = [
        f"This is a {domain} document in {fmt} format. ",
        f"Summary: {summary} ",
        f"This metadata is provided to orient your extraction, not to constrain it. ",
        f"Extract entities and relationships freely from the document text itself; ",
        f"the metadata is descriptive context, not a checklist.",
    ]
    return "".join(pieces)
def submit_episode_singular(name, content, custom_instructions):
    """POST one episode to the sidecar's /episodes endpoint under TEST_GROUP_ID.

    *content* is cut to MAX_DOC_CHARS. Returns the decoded JSON reply and
    raises (via raise_for_status) on an HTTP error status.
    """
    endpoint = f"{SIDECAR_URL}/episodes"
    body = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": "e14_replication_run",
        "timestamp": "2026-04-29T00:00:00",
        "group_id": TEST_GROUP_ID,
        "custom_extraction_instructions": custom_instructions,
    }
    reply = requests.post(endpoint, json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
def load_state():
    """Load prior results so an interrupted run can resume.

    Returns (results, completed): the saved result records, and the set of
    names whose record contains a "submit_result". Both are empty when
    RESULTS_FILE does not exist.
    """
    if not RESULTS_FILE.exists():
        return [], set()
    with open(RESULTS_FILE) as fh:
        saved = json.load(fh)
    records = saved.get("results", [])
    done = {rec["name"] for rec in records if "submit_result" in rec}
    return records, done
def main():
    """Run the E1.4 cascade replication over every episode in SAMPLE_FILE.

    For each episode not already completed: fetch its source text, generate
    Mistral metadata, turn that into extraction instructions, and submit the
    episode to the sidecar. RESULTS_FILE is rewritten after every episode.
    """
    with open(SAMPLE_FILE) as f:
        selected = json.load(f)["selected"]
    results, completed = load_state()

    def checkpoint():
        # Persist everything gathered so far; load_state() reads it back on the next run.
        with open(RESULTS_FILE, "w") as out:
            json.dump({"results": results}, out, indent=2, default=str)

    if not completed:
        print(f"E1.4 cascade replication — {len(selected)} episodes to group_id={TEST_GROUP_ID}\n")
    else:
        print(f"Resuming — {len(completed)} sources already completed, {len(selected) - len(completed)} remaining\n")

    for i, ep in enumerate(selected, 1):
        name, bucket = ep["name"], ep["bucket"]
        if name in completed:
            print(f"[{i}/{len(selected)}] [{bucket}] {name} — SKIP (already completed)")
            continue
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")

        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}
        if ep.get("subtype"):
            record["subtype"] = ep["subtype"]

        # Step 1: source text.
        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            checkpoint()
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        # Step 2: metadata. A failure is recorded but does not stop the submission.
        print(f" Generating Mistral metadata...", end=" ", flush=True)
        started = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - started
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" not in metadata:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")
        else:
            print(f"FAILED in {elapsed:.1f}s")
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions

        # Step 3: submission.
        print(f" Submitting via /episodes...", end=" ", flush=True)
        started = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
        except Exception as e:
            elapsed = time.time() - started
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)
        else:
            elapsed = time.time() - started
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result

        results.append(record)
        checkpoint()
        print()

    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
+160
View File
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
import json
import re
import subprocess
from pathlib import Path
# Directory holding experiment inputs and outputs.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# E1's sample file; main() excludes every source listed in it.
E1_SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
# Destination JSON written by main().
OUTPUT = EXPERIMENTS / "e14_sample.json"
# Number of episodes to draw per bucket (8 + 8 + 8 + 6 = 30).
TARGETS = {"high": 8, "mid": 8, "low": 8, "document": 6}
def query_episode_counts():
    """Return ``[{"name", "entities"}, ...]`` for every Episodic node in the ``aaron`` graph.

    Runs the query through ``docker exec falkordb redis-cli`` and parses the
    plain-text reply, which is read as alternating name / count lines after a
    two-line header. Parsing stops at the first line starting with "Cached"
    or "Query".
    """
    cypher = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
              "RETURN e.name AS name, count(distinct n) AS entities "
              "ORDER BY entities DESC")
    proc = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", cypher],
        capture_output=True, text=True
    )
    rows = [row for row in proc.stdout.split("\n") if row.strip()]
    total = len(rows)
    episodes = []
    pos = 0
    while pos < total:
        current = rows[pos]
        if current == "name":
            # Header: skip both column names ("name", "entities").
            pos += 2
            continue
        if current.startswith(("Cached", "Query")):
            break
        if pos + 1 >= total:
            pos += 1
            continue
        try:
            count = int(rows[pos + 1])
        except ValueError:
            # Next line is not a count; resynchronise one line further on.
            pos += 1
            continue
        episodes.append({"name": current, "entities": count})
        pos += 2
    return episodes
def is_document(name):
    """Return True when *name* ends, case-insensitively, with a known document extension."""
    document_suffixes = (".pdf", ".docx", ".pptx", ".txt", ".md")
    return name.lower().endswith(document_suffixes)
def doc_subtype(name):
    """Classify a document name as academic, reference, creative or other.

    Matching is by case-insensitive substring; the first rule that matches wins.
    """
    lowered = name.lower()
    rules = (
        ("academic", ("syllabus", "ind study", "_is")),
        ("reference", ("annual", "report", "_ar20", "rtpcc", "novo")),
        ("reference", ("cv", "resume", "application", "cover letter")),
        ("creative", ("marquee", "pptx", "presentation")),
    )
    for label, needles in rules:
        if any(needle in lowered for needle in needles):
            return label
    return "other"
def main():
    """Draw the stratified E1.4 sample and write it to OUTPUT.

    Episodes already used by E1 are excluded. Non-document episodes are split
    into high / mid / low buckets at the entity-count quartiles; documents
    with at least 5 entities are stratified by subtype. Each pick is taken
    from the middle of its bucket.

    Raises:
        SystemExit: when the graph query returns no episodes.
    """
    print("Fetching episode entity counts from Tier 1 graph...")
    episodes = query_episode_counts()
    print(f"Got {len(episodes)} episodes")
    if not episodes:
        # The quartile indexing below would raise IndexError on an empty list.
        raise SystemExit("No episodes returned by the graph query; nothing to sample.")

    # Load E1's sample to exclude
    with open(E1_SAMPLE_FILE) as f:
        e1_sample = json.load(f)
    e1_names = {ep["name"] for ep in e1_sample["selected"]}
    print(f"Excluding {len(e1_names)} sources from E1")

    # Quartile boundaries (counts sorted descending, so n // 4 is the top quartile)
    counts = sorted([e["entities"] for e in episodes], reverse=True)
    n = len(counts)
    top_q = counts[n // 4]
    bottom_q = counts[3 * n // 4]
    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")

    # Filter out E1 and bucket
    available = [e for e in episodes if e["name"] not in e1_names]
    high = [e for e in available if e["entities"] >= top_q and not is_document(e["name"])]
    mid = [e for e in available if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
    low = [e for e in available if e["entities"] <= bottom_q and not is_document(e["name"])]
    docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]
    print(f"\nAvailable after E1 exclusion:")
    print(f" High-density: {len(high)}")
    print(f" Mid-density: {len(mid)}")
    print(f" Low-density: {len(low)}")
    print(f" Documents: {len(docs)}")

    # For high/mid/low: take from middle of bucket (avoids edge cases)
    def pick(bucket, n):
        if len(bucket) < n:
            print(f" WARNING: only {len(bucket)} available, asked for {n}")
            return bucket
        mid_idx = len(bucket) // 2
        start = max(0, mid_idx - n // 2)
        return bucket[start:start + n]

    selected = []
    for label, pool in (("high", high), ("mid", mid), ("low", low)):
        for ep in pick(pool, TARGETS[label]):
            ep["bucket"] = label
            selected.append(ep)

    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
    docs_by_subtype = {}
    for ep in docs:
        st = doc_subtype(ep["name"])
        ep["subtype"] = st
        docs_by_subtype.setdefault(st, []).append(ep)
    print(f"\n Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")

    # Pick from middle of each subtype bucket
    for subtype, target in doc_targets.items():
        for ep in pick(docs_by_subtype.get(subtype, []), target):
            ep["bucket"] = "document"
            selected.append(ep)

    # If we're short on documents (e.g., subtype underrepresented), fill from "other"
    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
    if doc_count < TARGETS["document"]:
        shortage = TARGETS["document"] - doc_count
        # Build the lookup once instead of once per candidate document.
        selected_names = {s["name"] for s in selected}
        leftover = [e for e in docs if e["name"] not in selected_names]
        for ep in leftover[:shortage]:
            ep["bucket"] = "document"
            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
            selected.append(ep)

    print(f"\nSelected {len(selected)} episodes for E1.4:")
    for ep in selected:
        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
        print(f" [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e {ep['name']}")

    with open(OUTPUT, "w") as f:
        json.dump({
            "metadata": {
                "purpose": "E1.4 cascade re-extraction replication (n=30)",
                "exclusions": "E1's 10 sources",
                "stratification": {**TARGETS, "document_subtypes": doc_targets},
                "quartile_top": top_q,
                "quartile_bottom": bottom_q,
            },
            "selected": selected,
        }, f, indent=2)
    print(f"\nSaved to {OUTPUT}")


if __name__ == "__main__":
    main()
+246
View File
@@ -0,0 +1,246 @@
"""
E1.6 analysis — correlate domain-purity ratings with cascade outcomes.
Applies pre-registered decision rules from E1.6 protocol.
"""
import json
from collections import defaultdict
# Purity ratings JSON; main() reads its "ratings" list.
RATINGS_PATH = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
# Per-source cascade comparison JSON; main() reads it as a list of records keyed by "name".
COMPARISON_PATH = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
def spearman(xs, ys):
    """Return the Spearman rank correlation of *xs* and *ys*.

    Tied values receive the average of the ranks they span. Returns None when
    fewer than two points are given or when either ranking has zero variance.
    """
    n = len(xs)
    if n < 2:
        return None

    def rank(values):
        # 1-based ranks; a run of equal values shares the mean of its positions.
        size = len(values)
        order = sorted(range(size), key=values.__getitem__)
        ranks = [0] * size
        start = 0
        while start < size:
            end = start
            while end + 1 < size and values[order[end + 1]] == values[order[start]]:
                end += 1
            shared = (start + end) / 2 + 1
            for pos in range(start, end + 1):
                ranks[order[pos]] = shared
            start = end + 1
        return ranks

    rank_x = rank(xs)
    rank_y = rank(ys)
    mean_x = sum(rank_x) / n
    mean_y = sum(rank_y) / n
    # Pearson correlation computed on the ranks.
    dev_x = [rank_x[i] - mean_x for i in range(n)]
    dev_y = [rank_y[i] - mean_y for i in range(n)]
    covariance = sum(dev_x[i] * dev_y[i] for i in range(n))
    spread_x = (sum(d ** 2 for d in dev_x)) ** 0.5
    spread_y = (sum(d ** 2 for d in dev_y)) ** 0.5
    if spread_x == 0 or spread_y == 0:
        return None
    return covariance / (spread_x * spread_y)
def main():
    """Join purity ratings with cascade outcomes, print the E1.6 report, and save the joined rows.

    Sections printed: per-source detail, the primary binary-purity test with
    its decision rule, Spearman correlations, within-bucket breakdowns, and
    mean predicate deltas per group. Correlations that spearman() cannot
    compute (it returns None) are shown as "n/a".
    """
    with open(RATINGS_PATH) as f:
        ratings_data = json.load(f)
    with open(COMPARISON_PATH) as f:
        comparisons = json.load(f)
    ratings_by_name = {r['name']: r for r in ratings_data['ratings']}
    comp_by_name = {c['name']: c for c in comparisons}

    # Join ratings with cascade outcomes
    joined = []
    for name, rating in ratings_by_name.items():
        if name in comp_by_name:
            comp = comp_by_name[name]
            joined.append({
                'name': name,
                'binary': rating['binary'],
                'score': rating['score'],
                'note': rating.get('note'),
                'bucket': comp['bucket'],
                'delta_preds': comp['delta_preds'],
                'delta_edges': comp['delta_edges'],
                'prod_preds': comp['prod_preds'],
                'cascade_preds': comp['cascade_preds'],
            })

    print("=" * 100)
    print(f"E1.6 ANALYSIS — Domain Purity vs Cascade Outcome (n={len(joined)})")
    print("=" * 100)

    # Per-source detail with rating
    print()
    print(f"{'Bucket':<10} {'Source':<48} {'Domain':<8} {'Score':<6} {'Δpreds':<8} {'Δedges':<8}")
    print("-" * 100)
    for j in sorted(joined, key=lambda x: (x['binary'], -x['score'], x['bucket'], x['name'])):
        name_short = (j['name'][:45] + '..') if len(j['name']) > 48 else j['name']
        print(f"{j['bucket']:<10} {name_short:<48} {j['binary']:<8} {j['score']:<6} {j['delta_preds']:+d} {j['delta_edges']:+d}")

    # PRIMARY TEST: binary purity vs cascade outcome distribution
    print()
    print("=" * 100)
    print("PRIMARY TEST: Binary purity vs cascade outcome distribution")
    print("=" * 100)

    def categorize_outcome(delta):
        if delta > 0:
            return 'positive'
        elif delta < 0:
            return 'negative'
        else:
            return 'flat'

    by_binary = defaultdict(lambda: {'positive': 0, 'flat': 0, 'negative': 0, 'total': 0})
    for j in joined:
        outcome = categorize_outcome(j['delta_preds'])
        by_binary[j['binary']][outcome] += 1
        by_binary[j['binary']]['total'] += 1

    print()
    print(f"{'Group':<15} {'n':<5} {'Positive':<12} {'Flat':<10} {'Negative':<12}")
    print("-" * 60)
    for binary in ['single', 'multi']:
        d = by_binary[binary]
        n = d['total']
        if n == 0:
            continue
        pos_pct = d['positive'] / n * 100
        flat_pct = d['flat'] / n * 100
        neg_pct = d['negative'] / n * 100
        print(f"{binary+'-domain':<15} {n:<5} {d['positive']} ({pos_pct:.0f}%) {d['flat']} ({flat_pct:.0f}%) {d['negative']} ({neg_pct:.0f}%)")

    # Compute the gap (only defined when both groups are non-empty)
    if by_binary['single']['total'] > 0 and by_binary['multi']['total'] > 0:
        single_pos_rate = by_binary['single']['positive'] / by_binary['single']['total'] * 100
        multi_pos_rate = by_binary['multi']['positive'] / by_binary['multi']['total'] * 100
        gap = single_pos_rate - multi_pos_rate
        print()
        print(f"Cascade-positive rate gap (single - multi): {gap:+.1f} percentage points")
        print()
        # Apply pre-registered decision rule
        if gap >= 20:
            verdict = "NARROWNESS HYPOTHESIS SUPPORTED"
            detail = f"Single-domain content is {gap:.0f}pp more likely to gain from cascade than multi-domain."
        elif gap <= -20:
            verdict = "REVERSE OF HYPOTHESIS"
            detail = f"Multi-domain content unexpectedly benefits more (counter to prediction)."
        elif abs(gap) < 10:
            verdict = "HYPOTHESIS NOT SUPPORTED"
            detail = "Domain purity does not appear to predict cascade outcome."
        else:
            verdict = "INCONCLUSIVE"
            detail = f"Gap of {gap:+.0f}pp is suggestive but below the pre-registered 20pp threshold."
        print(f" Pre-registered decision rule: {verdict}")
        print(f" {detail}")

    # SECONDARY TEST: Spearman correlation between purity score and predicate delta
    print()
    print("=" * 100)
    print("SECONDARY TEST: Spearman rank correlation (purity score vs predicate delta)")
    print("=" * 100)
    scores = [j['score'] for j in joined]
    deltas_pred = [j['delta_preds'] for j in joined]
    deltas_edge = [j['delta_edges'] for j in joined]
    rho_pred = spearman(scores, deltas_pred)
    rho_edge = spearman(scores, deltas_edge)

    def fmt_rho(rho):
        # spearman() returns None when the correlation is undefined
        # (fewer than two points, or no variance in one ranking).
        return f"{rho:.3f}" if rho is not None else "n/a"

    print()
    print(f" Spearman ρ (purity score vs Δpredicates): {fmt_rho(rho_pred)}")
    print(f" Spearman ρ (purity score vs Δedges): {fmt_rho(rho_edge)}")
    print()
    if rho_pred is not None:
        if rho_pred >= 0.4:
            v = "STRONG POSITIVE — narrowness hypothesis supported with monotonic relationship"
        elif rho_pred >= 0.2:
            v = "WEAK POSITIVE — consistent with hypothesis but not strong evidence"
        elif rho_pred <= -0.2:
            v = "NEGATIVE — refutes hypothesis"
        else:
            v = "NO CORRELATION — hypothesis not supported"
        print(f" Predicate delta verdict: {v}")
    print()

    # TERTIARY TEST: within-bucket correlation
    print()
    print("=" * 100)
    print("TERTIARY TEST: Within-bucket correlation")
    print("=" * 100)
    by_bucket = defaultdict(list)
    for j in joined:
        by_bucket[j['bucket']].append(j)
    print()
    print(f"{'Bucket':<12} {'n':<5} {'Single':<10} {'Multi':<10} {'ρ (score vs Δpred)':<22}")
    print("-" * 75)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        n_single = sum(1 for j in items if j['binary'] == 'single')
        n_multi = sum(1 for j in items if j['binary'] == 'multi')
        if n >= 3:
            scores_b = [j['score'] for j in items]
            deltas_b = [j['delta_preds'] for j in items]
            rho_b = spearman(scores_b, deltas_b)
            rho_str = f"{rho_b:+.3f}" if rho_b is not None else "n/a (no variance)"
        else:
            rho_str = "n/a (too few)"
        print(f"{bucket:<12} {n:<5} {n_single:<10} {n_multi:<10} {rho_str}")

    # Interaction with bucket: do single/multi outcomes differ within bucket?
    def rate_str(group):
        if not group:
            return ""
        pos = sum(1 for j in group if j['delta_preds'] > 0)
        return f"{pos}/{len(group)} positive ({pos/len(group)*100:.0f}%)"

    print()
    print("Per-bucket cascade-positive rate by binary purity:")
    print()
    print(f"{'Bucket':<12} {'Single':<25} {'Multi':<25}")
    print("-" * 65)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        single_items = [j for j in items if j['binary'] == 'single']
        multi_items = [j for j in items if j['binary'] == 'multi']
        print(f"{bucket:<12} {rate_str(single_items):<25} {rate_str(multi_items):<25}")

    # MEAN DELTA by binary group
    print()
    print("=" * 100)
    print("MEAN PREDICATE DELTA BY GROUP")
    print("=" * 100)
    print()
    for binary in ['single', 'multi']:
        items = [j for j in joined if j['binary'] == binary]
        if not items:
            continue
        n = len(items)
        mean_dp = sum(j['delta_preds'] for j in items) / n
        mean_de = sum(j['delta_edges'] for j in items) / n
        sum_pp = sum(j['prod_preds'] for j in items)
        sum_cp = sum(j['cascade_preds'] for j in items)
        pct_change = (sum_cp - sum_pp) / sum_pp * 100 if sum_pp else 0
        print(f"{binary}-domain (n={n}):")
        print(f" Mean Δpredicates per source: {mean_dp:+.2f}")
        print(f" Mean Δedges per source: {mean_de:+.2f}")
        # Separate the before/after totals; printed back to back they read as one number.
        print(f" Aggregate predicate change: {sum_pp} → {sum_cp} ({pct_change:+.1f}%)")
        print()

    # Save joined data for the experiments log writeup
    out_path = "/home/aaron/aaronai/experiments/e16_joined_analysis.json"
    with open(out_path, "w") as f:
        json.dump(joined, f, indent=2)
    print(f"Joined data saved to {out_path}")


if __name__ == "__main__":
    main()
+206
View File
@@ -0,0 +1,206 @@
"""
E1.6 domain-purity rating interface — with full metadata context.
"""
import json
import os
import random
# E1.4 run results; main() rates every record in it that has a "submit_result".
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
# Ratings file, read by load_existing() and rewritten by save().
RATINGS_OUT = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
# Instructions printed once by main() before the first prompt.
INTRO = """
================================================================================
E1.6 — DOMAIN-PURITY RATING
================================================================================
Two ratings per source:
1. BINARY — single-domain (s) or multi-domain (m)?
Mental test: "If Mistral had to pick ONE domain class for this source,
would picking just one significantly UNDER-DESCRIBE the content?"
YES → MULTI-DOMAIN (m) — content lives across two+ frames meaningfully
NO → SINGLE-DOMAIN (s) — content fits cleanly within one frame
2. SCORE (1-5) — how cleanly does it fit?
5 = unambiguously one domain
4 = primarily one domain, slight other element
3 = balanced two-domain
2 = primarily two-domain with traces of a third
1 = three or more domain frames weighted significantly
Single binary usually = score 4-5
Multi binary usually = score 1-3
You see for each source: name, length, AND the full Mistral metadata block
(domain_class, primary_format, structural_signals, content_signals, summary).
Blind to: bucket assignment, cascade outcome.
Commands at any prompt: 's', 'm', 'skip', 'quit'
================================================================================
""".strip()
def load_existing():
    """Return the saved rating state from RATINGS_OUT, or a fresh empty state if the file is absent."""
    if not os.path.exists(RATINGS_OUT):
        return {"ratings": [], "completed_names": []}
    with open(RATINGS_OUT) as saved:
        return json.load(saved)
def save(data):
    """Overwrite RATINGS_OUT with *data* serialised as indented JSON."""
    with open(RATINGS_OUT, "w") as out:
        json.dump(data, out, indent=2)
def render_metadata(metadata):
    """Print a metadata dict, known fields first and any remaining fields after.

    Lists are printed one item per line, strings longer than 70 characters go
    on their own line, and ``char_length`` is never shown. A non-dict value or
    a dict with an ``error`` key produces a single explanatory line instead.
    """
    if not isinstance(metadata, dict):
        print(" (metadata unavailable)")
        return
    if 'error' in metadata:
        print(f" (metadata error: {metadata['error']})")
        return

    # Render fields in a stable order
    field_order = [
        'domain_class',
        'primary_format',
        'structural_signals',
        'content_signals',
        'summary',
    ]

    def as_label(key):
        return key.replace('_', ' ').title()

    for field in field_order:
        if field not in metadata:
            continue
        value = metadata[field]
        label = as_label(field)
        if isinstance(value, list):
            if not value:
                print(f" {label}: (none)")
                continue
            print(f" {label}:")
            for item in value:
                print(f" - {item}")
        elif isinstance(value, str) and len(value) > 70:
            # Long strings get their own line.
            print(f" {label}:")
            print(f" {value}")
        else:
            print(f" {label}: {value}")

    # Show any other fields not in the standard order
    for field in metadata.keys():
        if field in field_order or field == 'char_length':
            continue
        print(f" {as_label(field)}: {metadata[field]}")
def render_source(src, idx, total):
    """Print one source's banner, name, character count and Mistral metadata block."""
    banner = "=" * 80
    print()
    print(banner)
    print(f" Source {idx}/{total}")
    print(banner)
    print(f"Name: {src['name']}")
    print(f"Length: {src['doc_chars']:,} chars")
    print()
    print("Mistral metadata:")
    print()
    # A missing metadata key is rendered as an empty block.
    render_metadata(src.get('metadata', {}))
    print()
    print("-" * 80)
def get_rating():
    """Prompt the rater for one source.

    Returns:
        ``'quit'`` if the rater asked to stop at either prompt, ``None`` if
        the source was skipped, otherwise a dict with ``binary``
        ("single"/"multi"), ``score`` (1-5) and ``note`` (str or None).
    """
    choice = None
    while choice not in ('s', 'm', 'skip', 'quit'):
        if choice is not None:
            print(" Please enter 's', 'm', 'skip', or 'quit'")
        choice = input("Single-domain or multi-domain? [s/m/skip/quit]: ").strip().lower()
    if choice == 'quit':
        return 'quit'
    if choice == 'skip':
        return None

    while True:
        answer = input("Purity score (1=many frames, 5=clearly single): ").strip()
        if answer.lower() == 'quit':
            return 'quit'
        try:
            score = int(answer)
        except ValueError:
            print(" Please enter a number 1-5 (or 'quit')")
            continue
        if 1 <= score <= 5:
            break
        print(" Score must be 1-5")

    note = input("Optional note (Enter to skip): ").strip()
    return {
        "binary": "single" if choice == 's' else "multi",
        "score": score,
        "note": note or None,
    }
def main():
    """Walk the rater through every unrated E1.4 source, saving after each rating.

    Sources are shuffled with a fixed seed, so the order is the same on every
    run. Ctrl-C, end-of-input or 'quit' saves the current state and stops.
    """
    with open(E14_RESULTS) as f:
        e14 = json.load(f)
    sources = [r for r in e14['results'] if 'submit_result' in r]
    shuffled = list(sources)
    random.Random(42).shuffle(shuffled)

    state = load_existing()
    completed = set(state['completed_names'])
    remaining = [s for s in shuffled if s['name'] not in completed]

    print(INTRO)
    print()
    print(f"Total sources: {len(sources)}")
    print(f"Already rated: {len(completed)}")
    print(f"Remaining: {len(remaining)}")
    print()
    if not remaining:
        print("All sources rated. Run analysis script next.")
        return
    input("Press Enter to begin...")

    def stop(message):
        # Announce, then flush the ratings gathered so far.
        print(message)
        save(state)

    try:
        position = len(completed)
        for src in remaining:
            position += 1
            render_source(src, position, len(sources))
            try:
                rating = get_rating()
            except (KeyboardInterrupt, EOFError):
                stop("\n\nSaving and exiting...")
                return
            if rating == 'quit':
                stop("\nSaving and exiting...")
                return
            if rating is None:
                print(" Skipped")
                continue
            rating['name'] = src['name']
            state['ratings'].append(rating)
            state['completed_names'].append(src['name'])
            save(state)
            print(f" Recorded: {rating['binary']}-domain, score={rating['score']}")
        print()
        print("=" * 80)
        print(f"Done. Rated {len(state['ratings'])} sources.")
        print(f"Saved to {RATINGS_OUT}")
    except (KeyboardInterrupt, EOFError):
        stop("\n\nSaving...")


if __name__ == "__main__":
    main()
+134
View File
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""E1 metrics comparison — A (Tier 1 aaron) vs B (cascade aaron_cascade_test) on the 10 sample sources."""
import json
import subprocess
from pathlib import Path
# Directory holding experiment inputs and outputs.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Sample whose "selected" episodes main() compares across the two graphs.
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
# Destination JSON for the per-source and aggregate metrics written by main().
COMPARISON_FILE = EXPERIMENTS / "cascade_reextract_comparison.json"
def query(group_id, cypher):
    """Run a Cypher statement against graph *group_id* and return redis-cli's raw stdout.

    The command is ``docker exec falkordb redis-cli GRAPH.QUERY``; its exit
    status and stderr are not inspected.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
def parse_int_result(output):
    """Return the first all-digit line of redis-cli GRAPH.QUERY output as an int, or 0 if there is none."""
    stripped = (row.strip() for row in output.split("\n"))
    return next((int(row) for row in stripped if row.isdigit()), 0)
def parse_string_list(output):
    """Return the data lines of redis-cli output, dropping the header and the timing footer.

    The header is taken to be the first non-blank line shorter than 60
    characters that contains none of ``{}[]``; everything after it is data
    until a line starting with "Cached" or "Query internal".
    """
    def is_footer(row):
        return row.startswith("Cached") or row.startswith("Query internal")

    rows = iter([raw.strip() for raw in output.split("\n") if raw.strip()])

    # Phase 1: consume lines up to and including the header row.
    for row in rows:
        if is_footer(row):
            return []
        # The column names are not known a priori, so the header is detected by shape.
        if len(row) < 60 and not any(c in row for c in "{}[]"):
            break
    else:
        return []

    # Phase 2: everything up to the footer is data.
    items = []
    for row in rows:
        if is_footer(row):
            break
        items.append(row)
    return items
def metrics_for_source(group_id, source_name):
    """Get entity, edge and relationship-type counts for one source's episode in one graph.

    Args:
        group_id: Graph name passed to GRAPH.QUERY.
        source_name: Value of the Episodic node's ``name`` property.

    Returns:
        dict with integer ``entities``, ``edges`` and ``rel_types``. A count
        is 0 when the query output contains no all-digit line.
    """
    # Escape backslashes and double quotes so a name containing them cannot
    # end the Cypher string literal early and silently produce a zero count.
    safe = source_name.replace("\\", "\\\\").replace('"', '\\"')
    episode = f'MATCH (e:Episodic {{name: "{safe}"}})'

    # Total entities connected to this episode
    q = f'{episode}-[]-(n:Entity) RETURN count(distinct n) AS entities'
    entities = parse_int_result(query(group_id, q))

    # Total edges from this episode (all relationship types)
    q = f'{episode}-[r]-() RETURN count(r) AS edges'
    edges = parse_int_result(query(group_id, q))

    # Distinct relationship types in edges from entities of this episode
    q = (f'{episode}-[]-(n:Entity)-[r]-() '
         f'RETURN count(distinct type(r)) AS types')
    rel_types = parse_int_result(query(group_id, q))

    return {"entities": entities, "edges": edges, "rel_types": rel_types}
def _pct_delta(new, old, spec):
    """Format the relative change from *old* to *new* as a percentage, or "n/a" when *old* is zero."""
    if not old:
        return "n/a"
    return f"{format((new - old) / old * 100, spec)}%"


def main():
    """Compare per-source graph metrics between ``aaron`` (A) and ``aaron_cascade_test`` (B).

    Prints a per-source table, overall and per-bucket aggregates, and the
    number of distinct relationship types in each graph, then writes the
    results to COMPARISON_FILE. Percentage changes whose baseline is zero are
    shown as "n/a"; an empty sample prints a notice and writes nothing.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 metrics comparison — {len(selected)} sources, A=aaron vs B=aaron_cascade_test\n")
    print(f"{'Source':<60} {'A.ent':>6} {'B.ent':>6} {'A.edg':>6} {'B.edg':>6} {'A.typ':>6} {'B.typ':>6}")
    print("-" * 110)
    results = []
    for ep in selected:
        name = ep["name"]
        bucket = ep["bucket"]
        a = metrics_for_source("aaron", name)
        b = metrics_for_source("aaron_cascade_test", name)
        record = {
            "name": name, "bucket": bucket,
            "a_entities": a["entities"], "b_entities": b["entities"],
            "a_edges": a["edges"], "b_edges": b["edges"],
            "a_rel_types": a["rel_types"], "b_rel_types": b["rel_types"],
        }
        results.append(record)
        # Truncate name for display
        display_name = name if len(name) <= 58 else name[:55] + "..."
        print(f"{display_name:<60} {a['entities']:>6} {b['entities']:>6} {a['edges']:>6} {b['edges']:>6} {a['rel_types']:>6} {b['rel_types']:>6}")

    # Aggregates
    print("\n" + "=" * 110)
    n = len(results)
    if n == 0:
        # Every mean below divides by n.
        print("\nNo sources in sample; nothing to compare.")
        return
    a_ent_sum = sum(r["a_entities"] for r in results)
    b_ent_sum = sum(r["b_entities"] for r in results)
    a_edge_sum = sum(r["a_edges"] for r in results)
    b_edge_sum = sum(r["b_edges"] for r in results)
    a_types_sum = sum(r["a_rel_types"] for r in results)
    b_types_sum = sum(r["b_rel_types"] for r in results)
    print(f"\nAggregate (n={n}):")
    print(f" Entities: A mean={a_ent_sum/n:.1f} B mean={b_ent_sum/n:.1f} delta={_pct_delta(b_ent_sum, a_ent_sum, '+.1f')}")
    print(f" Edges: A mean={a_edge_sum/n:.1f} B mean={b_edge_sum/n:.1f} delta={_pct_delta(b_edge_sum, a_edge_sum, '+.1f')}")
    print(f" Rel types: A mean={a_types_sum/n:.1f} B mean={b_types_sum/n:.1f} delta={_pct_delta(b_types_sum, a_types_sum, '+.1f')}")

    # Global predicate diversity check (unique types in each group_id)
    print(f"\nGlobal predicate diversity:")
    a_global = parse_int_result(query("aaron", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    b_global = parse_int_result(query("aaron_cascade_test", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    print(f" A (aaron): {a_global} distinct relationship types across whole graph")
    print(f" B (aaron_cascade_test): {b_global} distinct relationship types across whole graph")

    # Per-bucket
    print(f"\nPer-bucket aggregates:")
    for bucket in ["high", "mid", "low", "document"]:
        bucket_results = [r for r in results if r["bucket"] == bucket]
        if not bucket_results:
            continue
        bn = len(bucket_results)
        a_e = sum(r["a_entities"] for r in bucket_results) / bn
        b_e = sum(r["b_entities"] for r in bucket_results) / bn
        a_ed = sum(r["a_edges"] for r in bucket_results) / bn
        b_ed = sum(r["b_edges"] for r in bucket_results) / bn
        print(f" [{bucket:>8}] n={bn} A.ent={a_e:.1f} B.ent={b_e:.1f} ({_pct_delta(b_e, a_e, '+.0f')}) "
              f"A.edg={a_ed:.1f} B.edg={b_ed:.1f} ({_pct_delta(b_ed, a_ed, '+.0f')})")

    with open(COMPARISON_FILE, "w") as f:
        json.dump({
            "results": results,
            "aggregate": {
                "a_entities_total": a_ent_sum, "b_entities_total": b_ent_sum,
                "a_edges_total": a_edge_sum, "b_edges_total": b_edge_sum,
                "global_predicate_diversity": {"a": a_global, "b": b_global},
            },
        }, f, indent=2)
    print(f"\nSaved to {COMPARISON_FILE}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""E1 corrected metric — count distinct predicate names on edges originating from each episode."""
import json
import subprocess
from pathlib import Path
# Directory holding experiment inputs and outputs.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Sample whose "selected" episodes are compared across the two graphs.
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
def query(group_id, cypher):
    """Send *cypher* to graph *group_id* via ``docker exec falkordb redis-cli`` and return its stdout text.

    The exit status and stderr of the command are not inspected.
    """
    argv = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    proc = subprocess.run(argv, capture_output=True, text=True)
    return proc.stdout
def get_episode_uuid(group_id, episode_name):
    """Return the uuid of the Episodic node named *episode_name* in *group_id*.

    Returns None when no line of the query output looks like a UUID.
    """
    # Backslash-escape single quotes so the name fits inside the quoted literal.
    escaped_name = episode_name.replace("'", "\\'")
    output = query(
        group_id,
        f"MATCH (e:Episodic) WHERE e.name = '{escaped_name}' RETURN e.uuid LIMIT 1",
    )
    candidates = (row.strip() for row in output.split("\n"))
    # UUID shape test: 36 characters containing four hyphens.
    return next((c for c in candidates if len(c) == 36 and c.count("-") == 4), None)
def count_predicates_for_episode(group_id, uuid):
    """Count the distinct ``r.name`` values on RELATES_TO edges whose ``r.episodes`` contains *uuid*.

    Returns 0 when the query output contains no all-digit line.
    """
    statement = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    for token in query(group_id, statement).split("\n"):
        token = token.strip()
        if token.isdigit():
            return int(token)
    return 0
def count_total_edges_for_episode(group_id, uuid):
    """Count the RELATES_TO edges whose ``r.episodes`` contains *uuid*.

    Returns 0 when the query output contains no all-digit line.
    """
    statement = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    stripped = (row.strip() for row in query(group_id, statement).split("\n"))
    return next((int(row) for row in stripped if row.isdigit()), 0)
# Script body: compare edge and predicate counts per sampled episode between
# graph A ("aaron") and graph B ("aaron_cascade_test"), then save the result.
with open(SAMPLE_FILE) as f:
    sample = json.load(f)
selected = sample["selected"]

print(f"E1 corrected per-source comparison — predicates per episode by edge origin\n")
print(f"{'Source':<60} {'A.edges':>8} {'A.preds':>8} {'B.edges':>8} {'B.preds':>8}")
print("-" * 100)

records = []
for ep in selected:
    name = ep["name"]
    a_uuid = get_episode_uuid("aaron", name)
    b_uuid = get_episode_uuid("aaron_cascade_test", name)
    # An episode missing from a graph counts as zero edges and zero predicates there.
    if a_uuid:
        a_edges = count_total_edges_for_episode("aaron", a_uuid)
        a_preds = count_predicates_for_episode("aaron", a_uuid)
    else:
        a_edges = a_preds = 0
    if b_uuid:
        b_edges = count_total_edges_for_episode("aaron_cascade_test", b_uuid)
        b_preds = count_predicates_for_episode("aaron_cascade_test", b_uuid)
    else:
        b_edges = b_preds = 0
    display = name if len(name) <= 58 else name[:55] + "..."
    print(f"{display:<60} {a_edges:>8} {a_preds:>8} {b_edges:>8} {b_preds:>8}")
    records.append({
        "name": name, "bucket": ep["bucket"],
        "a_edges": a_edges, "a_preds": a_preds,
        "b_edges": b_edges, "b_preds": b_preds,
    })
print("-" * 100)

# Totals are derived from the collected records.
a_pred_total = sum(r["a_preds"] for r in records)
b_pred_total = sum(r["b_preds"] for r in records)
a_edge_total = sum(r["a_edges"] for r in records)
b_edge_total = sum(r["b_edges"] for r in records)

n = len(selected)
print(f"\nAggregate (n={n}):")
print(f" Edges: A total={a_edge_total} mean={a_edge_total/n:.1f} B total={b_edge_total} mean={b_edge_total/n:.1f}")
print(f" Predicates: A total={a_pred_total} mean={a_pred_total/n:.1f} B total={b_pred_total} mean={b_pred_total/n:.1f}")
if a_pred_total > 0:
    print(f" Predicate delta: B vs A = {(b_pred_total-a_pred_total)/a_pred_total*100:+.1f}%")
if a_edge_total > 0:
    print(f" Edge delta: B vs A = {(b_edge_total-a_edge_total)/a_edge_total*100:+.1f}%")

# Per-bucket
print(f"\nPer-bucket:")
for bucket in ["high", "mid", "low", "document"]:
    bucket_records = [r for r in records if r["bucket"] == bucket]
    if not bucket_records:
        continue
    bn = len(bucket_records)
    a_p = sum(r["a_preds"] for r in bucket_records)
    b_p = sum(r["b_preds"] for r in bucket_records)
    a_e = sum(r["a_edges"] for r in bucket_records)
    b_e = sum(r["b_edges"] for r in bucket_records)
    if a_p > 0:
        delta = ((b_p-a_p)/a_p*100)
    else:
        delta = 0
    print(f" [{bucket:>8}] n={bn} A.preds={a_p:>3} B.preds={b_p:>3} ({delta:+.0f}%) A.edges={a_e:>3} B.edges={b_e:>3}")

with open(EXPERIMENTS / "cascade_reextract_corrected_comparison.json", "w") as f:
    json.dump({"per_source": records,
               "aggregate": {"a_preds": a_pred_total, "b_preds": b_pred_total,
                             "a_edges": a_edge_total, "b_edges": b_edge_total}}, f, indent=2)
print(f"\nSaved to {EXPERIMENTS / 'cascade_reextract_corrected_comparison.json'}")
+190
View File
@@ -0,0 +1,190 @@
#!/usr/bin/env python3
"""E1 orchestration — fetch source text, run Mistral metadata, submit to Graphiti test group_id."""
import json
import os
import requests
import subprocess
import time
from pathlib import Path
import psycopg2
from dotenv import load_dotenv
# Load environment variables from the project's .env file; PG_DSN is read from the environment below.
load_dotenv(Path.home() / "aaronai" / ".env")
# Directory holding experiment inputs and outputs.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# NOTE(review): presumably the sample of episodes to re-extract — the reader is outside this view.
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
# NOTE(review): presumably where per-episode results are written — the writer is outside this view.
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
# PostgreSQL connection string used by get_pg(); a missing PG_DSN raises KeyError here.
PG_DSN = os.environ["PG_DSN"]
# Base URL of the Graphiti sidecar that receives episodes.
SIDECAR_URL = "http://localhost:8001"
# group_id under which this script's test episodes are submitted.
TEST_GROUP_ID = "aaron_cascade_test"
MAX_DOC_CHARS = 12000 # Same cap as Tier 1 for parity
# Stage 2 metadata prompt — verbatim from stage-2-worker-spec.md
# run_mistral_metadata() appends the truncated document text directly after "Document:".
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
def get_pg():
    """Open and return a new psycopg2 connection to the database at PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
def fetch_source_text(source):
    """Reassemble the full document from pgvector chunks, mirroring tier1_migration.py logic.

    Args:
        source: value matched against ``embeddings.source``.

    Returns:
        All chunks for ``source`` joined by blank lines in ``id`` order, or
        ``None`` when no chunk exists for that source.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        # NOTE: this is a normal (non-raw) Python string, so the "\n\n" below
        # reaches the server as two real newline characters inside the E'' literal.
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises, so a failed lookup does not leak
        # one connection per episode.
        conn.close()
    # The aggregate yields a single row whose value is NULL when nothing matched.
    if row is None or row[0] is None:
        return None
    return row[0]
def run_mistral_metadata(text):
    """Call local Mistral via Ollama for base-class metadata.

    Only the first MAX_DOC_CHARS characters of ``text`` are sent.

    Returns:
        The parsed metadata dict, with ``char_length`` replaced by the
        Python-computed length of the truncated text; or
        ``{"error": ..., "raw": ...}`` when the model's reply is not a JSON
        object (``raw`` holds the first 500 characters of the reply).

    Raises:
        requests.RequestException: connection failure, timeout, or a non-2xx
            status (via ``raise_for_status``).
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    # Valid JSON is not necessarily an object: a list/string/number reply would
    # make the item assignment below raise TypeError and abort the whole run.
    if not isinstance(metadata, dict):
        return {"error": "JSON was not an object", "raw": raw[:500]}
    # Override char_length with python-computed value (per stage-2-worker-spec)
    metadata["char_length"] = len(truncated)
    return metadata
def format_metadata_as_orientation(metadata):
    """Render base-class metadata as a Graphiti ``source_description`` string.

    The text describes the document's domain, format, and summary, and states
    that the metadata orients extraction rather than constraining it. When
    ``metadata`` carries an ``"error"`` key, a short failure label is returned
    instead.
    """
    failure = metadata.get("error") if "error" in metadata else None
    if "error" in metadata:
        return f"tier1_cascade_test (metadata generation failed: {failure})"

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
def submit_episode(name, content, source_description):
    """POST one episode to the Graphiti sidecar's ``/episodes/bulk`` endpoint.

    The content is capped at MAX_DOC_CHARS and the request is tagged with
    TEST_GROUP_ID. Returns the decoded JSON reply; a non-2xx status raises
    through ``raise_for_status``.
    """
    episode = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": source_description,
        "timestamp": "2026-04-28T00:00:00",
    }
    endpoint = f"{SIDECAR_URL}/episodes/bulk"
    reply = requests.post(
        endpoint,
        json={"episodes": [episode], "group_id": TEST_GROUP_ID},
        timeout=300,
    )
    reply.raise_for_status()
    return reply.json()
def _save_results(results):
    """Rewrite RESULTS_FILE with every record collected so far."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Run the E1 cascade re-extraction over every episode listed in SAMPLE_FILE.

    For each episode: fetch its text, generate Mistral metadata, turn that
    metadata into a source_description, and submit the episode to the Graphiti
    sidecar. One record per episode is collected and RESULTS_FILE is rewritten
    after every episode, including episodes skipped for missing source text,
    so the file always reflects everything processed so far.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 cascade re-extraction starting — {len(selected)} episodes to test group_id={TEST_GROUP_ID}\n")
    results = []
    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        # Fetch text
        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Persist the skip as well; otherwise a trailing skipped episode
            # would never reach the results file.
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        # Mistral metadata
        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # Submit to Graphiti (still attempted when metadata failed; the
        # source_description then carries the failure label)
        source_desc = format_metadata_as_orientation(metadata)
        record["source_description"] = source_desc
        print(f" Submitting to Graphiti test group...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode(name, text, source_desc)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best-effort: record the failure and move on to the next episode.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)
        results.append(record)

        # Save intermediate state after each episode
        _save_results(results)
        print()

    # Final write so the message below is true even for an empty sample.
    _save_results(results)
    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""E1 corrected re-run — cascade orientation passed via custom_extraction_instructions."""
import json
import os
import requests
import time
from pathlib import Path
import psycopg2
from dotenv import load_dotenv
# Load the project's .env into the process environment before PG_DSN is read below.
load_dotenv(Path.home() / "aaronai" / ".env")
# Directory holding this experiment's input sample and output results.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Input: JSON whose "selected" list names the episodes to process (read by main()).
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
# Output: per-episode results, rewritten by main() as the run progresses.
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
# PostgreSQL DSN; a missing PG_DSN raises KeyError at import time.
PG_DSN = os.environ["PG_DSN"]
# Base URL of the Graphiti sidecar that episodes are POSTed to.
SIDECAR_URL = "http://localhost:8001"
# group_id attached to every submission made by this script.
TEST_GROUP_ID = "aaron_cascade_test"
# Character cap applied both to the Mistral prompt and to the submitted episode content.
MAX_DOC_CHARS = 12000
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
Output JSON only. No prose, no explanation, no markdown code fences.
Schema:
{
"language": "<ISO 639-1 code>",
"char_length": <integer>,
"primary_format": "<prose|slides|code|structured|mixed>",
"structural_signals": {
"has_headings": <boolean>,
"has_bullet_lists": <boolean>,
"has_numbered_lists": <boolean>,
"has_tables": <boolean>,
"has_code_blocks": <boolean>,
"has_dates": <boolean>
},
"content_signals": {
"has_named_people": <boolean>,
"has_institutional_language": <boolean>,
"has_technical_terminology": <boolean>,
"has_first_person": <boolean>,
"has_quotations": <boolean>
},
"domain_class": "<technical|administrative|educational|personal|conversational>",
"one_sentence_summary": "<one sentence describing what the document is about>"
}
Document:
"""
def get_pg():
    """Open and return a fresh psycopg2 connection for the module-level PG_DSN.

    The caller owns the connection and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
def fetch_source_text(source):
    """Reassemble a full document from its pgvector chunks.

    Args:
        source: value matched against ``embeddings.source``.

    Returns:
        All chunks for ``source`` joined by blank lines in ``id`` order, or
        ``None`` when no chunk exists for that source.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        # NOTE: this is a normal (non-raw) Python string, so the "\n\n" below
        # reaches the server as two real newline characters inside the E'' literal.
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises, so a failed lookup does not leak
        # one connection per episode.
        conn.close()
    # The aggregate yields a single row whose value is NULL when nothing matched.
    if row is None or row[0] is None:
        return None
    return row[0]
def run_mistral_metadata(text):
    """Ask the local Mistral model (through Ollama) for base-class metadata.

    Only the first MAX_DOC_CHARS characters of ``text`` are sent.

    Returns:
        The parsed metadata dict, with ``char_length`` replaced by the
        Python-computed length of the truncated text; or
        ``{"error": ..., "raw": ...}`` when the model's reply is not a JSON
        object (``raw`` holds the first 500 characters of the reply).

    Raises:
        requests.RequestException: connection failure, timeout, or a non-2xx
            status (via ``raise_for_status``).
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    # Valid JSON is not necessarily an object: a list/string/number reply would
    # make the item assignment below raise TypeError and abort the whole run.
    if not isinstance(metadata, dict):
        return {"error": "JSON was not an object", "raw": raw[:500]}
    # The model's own char_length is not trusted; use the measured length.
    metadata["char_length"] = len(truncated)
    return metadata
def format_metadata_as_orientation(metadata):
    """Render metadata as orient-not-bound extraction instructions.

    Returns ``None`` when ``metadata`` carries an ``"error"`` key; otherwise a
    single string naming the document's domain, format, and summary, followed
    by a statement that the metadata is context rather than a checklist.
    """
    if "error" in metadata:
        return None

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
def submit_episode_singular(name, content, custom_instructions):
    """POST one episode to the Graphiti sidecar's singular ``/episodes`` endpoint.

    The cascade orientation travels in ``custom_extraction_instructions``;
    ``source_description`` is a fixed neutral label. Content is capped at
    MAX_DOC_CHARS. Returns the decoded JSON reply; a non-2xx status raises
    through ``raise_for_status``.
    """
    body = dict(
        name=name,
        content=content[:MAX_DOC_CHARS],
        source_description="e1_corrected_run",  # neutral label, not the cascade text
        timestamp="2026-04-28T00:00:00",
        group_id=TEST_GROUP_ID,
        custom_extraction_instructions=custom_instructions,
    )
    endpoint = f"{SIDECAR_URL}/episodes"
    reply = requests.post(endpoint, json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
def _save_results(results):
    """Rewrite RESULTS_FILE with every record collected so far."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Run the corrected E1 re-extraction over every episode in SAMPLE_FILE.

    For each episode: fetch its text, generate Mistral metadata, format it as
    custom extraction instructions, and submit through the singular /episodes
    endpoint. RESULTS_FILE is rewritten after every episode, including
    episodes skipped for missing source text, so the file always reflects
    everything processed so far.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 CORRECTED re-run — {len(selected)} episodes via /episodes (singular)")
    print(f"Cascade orientation passed in custom_extraction_instructions.\n")
    results = []
    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Persist the skip as well; otherwise a trailing skipped episode
            # would never reach the results file.
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # None when metadata generation failed; the episode is still submitted.
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions
        print(f" Submitting via /episodes (singular) with custom_extraction_instructions...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best-effort: record the failure and move on to the next episode.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)
        results.append(record)

        # Save intermediate state after each episode
        _save_results(results)
        print()

    # Final write so the message below is true even for an empty sample.
    _save_results(results)
    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""E1 sample selection — pick 10 episodes from Tier 1 stratified by density and type."""
import json
import os
import subprocess
from pathlib import Path
from collections import defaultdict
# Directory for experiment artefacts under the user's home.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Destination of the selected sample; written as JSON at the end of this script.
OUTPUT = EXPERIMENTS / "cascade_reextract_sample.json"
# Get all Tier 1 episodes with their entity counts via FalkorDB
def _parse_episode_rows(stdout):
    """Turn raw redis-cli GRAPH.QUERY output into ``[{"name", "entities"}, ...]``."""
    # Result-statistics lines that follow the data rows. Matching the full
    # phrases (rather than just "Cached"/"Query") keeps an episode whose name
    # merely starts with one of those words from ending the parse early.
    footer_prefixes = ("Cached execution", "Query internal execution time")
    lines = [l for l in stdout.split("\n") if l.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            i += 2  # skip "name" and "entities" headers
            continue
        if lines[i].startswith(footer_prefixes):
            break
        # Each episode: name on one line, count on next
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
                episodes.append({"name": lines[i], "entities": count})
                i += 2
            except ValueError:
                # Not a name/count pair; resynchronise one line further on.
                i += 1
        else:
            i += 1
    return episodes


def query_episode_counts():
    """Return every Episodic node with its count of distinct connected entities.

    Runs a Cypher query against the ``aaron`` graph through ``redis-cli``
    inside the ``falkordb`` container and parses the textual output.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in the order the
        query returned them (entity count descending).

    Raises:
        RuntimeError: when the docker/redis-cli command exits non-zero.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Without this check a failed command looks like "zero episodes".
        raise RuntimeError(
            f"FalkorDB query failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    # Parse the output — redis-cli returns rows after a header
    return _parse_episode_rows(result.stdout)
# Pull every episode with its entity count before any bucketing happens.
print("Fetching episode entity counts from FalkorDB...")
episodes = query_episode_counts()
print("Got {} episodes".format(len(episodes)))
# Classify by density bucket and type
def is_document(name):
    """Return True when ``name`` ends, case-insensitively, with a known document extension."""
    return name.lower().endswith((".pdf", ".docx", ".pptx", ".txt", ".md"))
# Compute quartile boundaries from the entity counts
counts = sorted((e["entities"] for e in episodes), reverse=True)
n = len(counts)
if n == 0:
    # Indexing an empty list below would otherwise fail with a bare IndexError.
    raise SystemExit("No episodes returned from FalkorDB — cannot compute quartile boundaries")
top_q = counts[n // 4]  # 25th percentile from top
bottom_q = counts[3 * n // 4]  # 75th percentile from top
print(f"\nQuartile boundaries: top={top_q}+, middle=({bottom_q+1}-{top_q-1}), bottom=0-{bottom_q}")

# Conversations are split into density buckets; documents form their own
# bucket and are only kept when they have at least 5 entities.
high = [e for e in episodes if e["entities"] >= top_q and not is_document(e["name"])]
mid = [e for e in episodes if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
low = [e for e in episodes if e["entities"] <= bottom_q and not is_document(e["name"])]
docs = [e for e in episodes if is_document(e["name"]) and e["entities"] >= 5]
print(f"High-density conversations: {len(high)}")
print(f"Mid-density conversations: {len(mid)}")
print(f"Low-density conversations: {len(low)}")
print(f"Documents (≥5 entities): {len(docs)}")
# Deterministic selection — take from middle of each bucket to avoid edge cases
def pick(bucket, n):
    """Return ``n`` consecutive items taken around the middle of ``bucket``.

    A bucket holding fewer than ``n`` items is returned unchanged.
    """
    size = len(bucket)
    if size >= n:
        first = max(0, size // 2 - n // 2)
        return bucket[first:first + n]
    return bucket
# 3 high + 3 mid + 2 low conversations, then 2 documents, in that order.
selected = [
    *pick(high, 3),
    *pick(mid, 3),
    *pick(low, 2),
    *pick(docs, 2),
]
# Tag each with its bucket
def bucket_for(ep):
    """Name the bucket for ``ep``: "document", "high", "mid", or "low".

    Documents are recognised by file extension first; everything else is
    placed by comparing its entity count with the module-level quartile
    boundaries ``top_q`` and ``bottom_q``.
    """
    if is_document(ep["name"]):
        return "document"
    entity_count = ep["entities"]
    if entity_count >= top_q:
        return "high"
    return "mid" if entity_count > bottom_q else "low"
# Label every chosen episode, then report the selection.
for chosen in selected:
    chosen["bucket"] = bucket_for(chosen)
print(f"\nSelected {len(selected)} episodes for E1:")
for chosen in selected:
    print(f" [{chosen['bucket']:>8}] {chosen['entities']:>3}e {chosen['name']}")

# Save selection
sample_payload = {
    "metadata": {
        "purpose": "E1 cascade re-extraction sample (n=10)",
        "stratification": "density buckets + document subset",
        "quartile_top": top_q,
        "quartile_bottom": bottom_q,
        "total_tier1_episodes": len(episodes),
    },
    "selected": selected,
}
with open(OUTPUT, "w") as out_file:
    json.dump(sample_payload, out_file, indent=2)
print(f"\nSaved to {OUTPUT}")
+24
View File
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""E2 follow-up: confirm Aaron AI alias situation, find other potential duplicates."""
import subprocess
# (label, Cypher) pairs run one after another against the "aaron" graph.
QUERIES = [
    (
        "Aaron AI variants",
        "MATCH (n:Entity) WHERE n.name CONTAINS 'Aaron AI' OR n.name CONTAINS 'ARIN' OR n.name CONTAINS 'RNAI' RETURN n.name, n.summary",
    ),
    (
        "All Mossygear-named entities",
        "MATCH (n:Entity) WHERE n.name CONTAINS 'Mossy' OR n.name CONTAINS 'A+K' OR n.name CONTAINS 'AK Design' RETURN n.name, n.summary",
    ),
    (
        "Total entity count check",
        "MATCH (n:Entity) RETURN count(n) as total",
    ),
    (
        "Top 30 entity names by edge count",
        "MATCH (n:Entity)-[r]-() RETURN n.name, count(r) as edges ORDER BY edges DESC LIMIT 30",
    ),
]

_RULE = '=' * 60
_GRAPH_QUERY_CMD = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron"]

for label, query in QUERIES:
    print(f"\n{_RULE}")
    print(f"QUERY: {label}")
    print(_RULE)
    # The raw redis-cli output is printed as-is; the exit status is not checked.
    completed = subprocess.run(_GRAPH_QUERY_CMD + [query], capture_output=True, text=True)
    print(completed.stdout)
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
"""E2: Entity resolution diagnostic. Queries Graphiti's FalkorDB for the six test entities."""
import subprocess
import sys
# Names searched for (as substrings) in the graph's Entity nodes.
TEST_ENTITIES = ["Aaron", "Kat", "HVAMC", "Bird", "Susan Hamlet", "Tulsa album"]


def run_cypher(query):
    """Run ``query`` against the "aaron" graph via redis-cli in the falkordb container.

    Returns the command's stdout as text; the exit status is not inspected.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
# One substring search per test entity, each under its own banner.
for name in TEST_ENTITIES:
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"ENTITY: {name}")
    print(rule)
    query = "MATCH (n:Entity) WHERE n.name CONTAINS '" + name + "' RETURN n.name, n.summary"
    print(run_cypher(query))
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""E2 follow-up: how many distinct episodes connect to each entity?"""
import subprocess
# Entities inspected, in report order; each gets the same exact-name lookup.
_ENTITY_NAMES = [
    "Aaron",
    "Nelson",
    "HVAMC",
    "Bird",
    "Tulsa album",
    "Susan Hamlet",
    "Kat",
    "Katherine Wilson",
]
_EPISODES_FOR_ENTITY = (
    "MATCH (n:Entity {{name: '{name}'}})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30"
)
# (label, Cypher) pairs: up to 30 distinct episodes connected to each entity.
QUERIES = [(name, _EPISODES_FOR_ENTITY.format(name=name)) for name in _ENTITY_NAMES]

_RULE = '=' * 60
_GRAPH_QUERY_CMD = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron"]

for label, query in QUERIES:
    print(f"\n{_RULE}")
    print(f"ENTITY: {label}")
    print(_RULE)
    # The raw redis-cli output is printed as-is; the exit status is not checked.
    completed = subprocess.run(_GRAPH_QUERY_CMD + [query], capture_output=True, text=True)
    print(completed.stdout)
@@ -0,0 +1,257 @@
#!/usr/bin/env python3
"""
Experiment 005 — Actual API Token Measurement
Measures input token reduction from prepending v2 briefing vs raw document
on Claude Haiku, validating the 42.0% modeled estimate from Experiment 002b.
Outputs: ~/aaronai/experiments/token_measurement_results.json
"""
import json
import os
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import anthropic
import psycopg2
from dotenv import load_dotenv
# Load the project's .env before main() reads ANTHROPIC_API_KEY and PG_DSN from os.environ.
load_dotenv(Path.home() / "aaronai" / ".env")
# Input: JSON with a "documents" list; main() uses entries with status "SUCCESS" and a briefing.
INPUT_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output: summary statistics plus per-document results, written once at the end of main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "token_measurement_results.json"
# Model identifier and output-token cap passed to every messages.create call.
MODEL = "claude-haiku-4-5-20251001"
MAX_TOKENS = 1024
# Instruction prefix shared by the raw and briefed messages, so the two
# conditions differ only by the inserted briefing section.
EXTRACTION_PROMPT = (
"Extract entities and their relationships from the document below. "
"Return ONLY valid JSON with this schema:\n"
"{\n"
' "people": [string],\n'
' "organizations": [string],\n'
' "locations": [string],\n'
' "dates": [string],\n'
' "relationships": [{"subject": string, "predicate": string, "object": string}]\n'
"}\n"
"No prose, no markdown fences, no commentary. JSON only."
)
def fetch_document_text(pg_conn, source):
    """Reconstruct the document by concatenating its chunks from pgvector.

    Args:
        pg_conn: open DB-API connection; it is used but not closed here.
        source: value matched against ``embeddings.source``.

    Returns:
        The chunks for ``source`` joined by blank lines in ``id`` order, or
        ``None`` when no chunk exists for that source.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cur.close()
    if not rows:
        return None
    return "\n\n".join(r[0] for r in rows)
def build_raw_message(document_text):
    """Build the baseline user message: the extraction prompt followed by the document."""
    document_section = f"DOCUMENT:\n{document_text}"
    return "\n\n".join((EXTRACTION_PROMPT, document_section))
def build_briefed_message(briefing, document_text):
    """Build the briefed user message: prompt, JSON-rendered briefing, then the document."""
    briefing_str = json.dumps(briefing, indent=2)
    sections = [
        EXTRACTION_PROMPT,
        f"BRIEFING (pre-analysis from local model — use to orient):\n{briefing_str}",
        f"DOCUMENT:\n{document_text}",
    ]
    return "\n\n".join(sections)
def call_haiku(client, message_text):
    """Send ``message_text`` as a single user turn and report usage and timing.

    Returns a dict with the reported input/output token counts, wall-clock
    latency in seconds (2 decimals), the text of the first content block
    (empty string when the reply has no content), and the stop reason.
    """
    started = time.time()
    reply = client.messages.create(
        model=MODEL,
        max_tokens=MAX_TOKENS,
        messages=[{"role": "user", "content": message_text}],
    )
    usage = reply.usage
    if reply.content:
        first_text = reply.content[0].text
    else:
        first_text = ""
    return {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
        "response_text": first_text,
        "stop_reason": reply.stop_reason,
    }
def ci_95(values):
    """Return ``(mean, half_width)`` of a normal-approximation 95% interval.

    The half-width is ``1.96 * stdev / sqrt(n)``. With fewer than two values
    the half-width is 0.0 and the mean is the single value, or 0.0 for an
    empty input.
    """
    count = len(values)
    if count >= 2:
        center = statistics.mean(values)
        half_width = 1.96 * statistics.stdev(values) / (count ** 0.5)
        return (center, half_width)
    return (statistics.mean(values) if values else 0.0, 0.0)
def main():
    """Measure input/output token counts for raw vs briefed prompts on each document.

    Loads the successful briefings from INPUT_FILE, rebuilds each document
    from pgvector, sends the raw and the briefed message to the model, and
    writes per-document results plus aggregate statistics to OUTPUT_FILE.
    Exits with status 1 when the input file, ANTHROPIC_API_KEY, or PG_DSN is
    missing. A failed API call is recorded for that document and the run
    continues.
    """
    if not INPUT_FILE.exists():
        print(f"ERROR: {INPUT_FILE} not found", file=sys.stderr)
        sys.exit(1)
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
        sys.exit(1)
    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    results = []
    try:
        with open(INPUT_FILE) as f:
            v2_data = json.load(f)
        docs_meta = [
            d for d in v2_data["documents"]
            if d.get("status") == "SUCCESS"
            and d.get("briefing")
        ]
        print(f"Loaded {len(docs_meta)} successful briefings from {INPUT_FILE.name}")
        print(f"Model: {MODEL}")
        print(f"Calls planned: up to {len(docs_meta) * 2}\n")

        started_at = datetime.now(timezone.utc).isoformat()
        t_total = time.time()
        for i, doc in enumerate(docs_meta, 1):
            source = doc["source"]
            briefing = doc["briefing"]
            document_text = fetch_document_text(pg_conn, source)
            if not document_text:
                print(f"[{i:02d}/{len(docs_meta)}] {source[:60]} -- SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue
            print(f"[{i:02d}/{len(docs_meta)}] {source[:60]}")

            # Each condition is attempted independently, so one failing call
            # still leaves the other's measurement in the record.
            try:
                raw_result = call_haiku(client, build_raw_message(document_text))
            except Exception as e:
                print(f" RAW FAILED: {e}")
                raw_result = {"error": str(e)}
            try:
                briefed_result = call_haiku(client, build_briefed_message(briefing, document_text))
            except Exception as e:
                print(f" BRIEFED FAILED: {e}")
                briefed_result = {"error": str(e)}

            # A delta exists only when both calls reported token usage.
            delta = None
            if "input_tokens" in raw_result and "input_tokens" in briefed_result:
                raw_in = raw_result["input_tokens"]
                briefed_in = briefed_result["input_tokens"]
                raw_out = raw_result["output_tokens"]
                briefed_out = briefed_result["output_tokens"]
                input_red = (raw_in - briefed_in) / raw_in * 100 if raw_in else 0.0
                output_delta = (briefed_out - raw_out) / raw_out * 100 if raw_out else 0.0
                delta = {
                    "input_reduction_pct": round(input_red, 2),
                    "output_delta_pct": round(output_delta, 2),
                    "raw_input_tokens": raw_in,
                    "briefed_input_tokens": briefed_in,
                    "raw_output_tokens": raw_out,
                    "briefed_output_tokens": briefed_out,
                }
                print(
                    f" in: {raw_in} -> {briefed_in} ({input_red:+.1f}%) | "
                    f"out: {raw_out} -> {briefed_out}"
                )
            results.append({
                "source": source,
                "raw": raw_result,
                "briefed": briefed_result,
                "delta": delta,
            })
    finally:
        # Close the connection even when loading the input or a pgvector
        # lookup raises part-way through the run.
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)
    valid = [r for r in results if r.get("delta") is not None]
    skipped = [r for r in results if r.get("skipped")]
    reductions = [r["delta"]["input_reduction_pct"] for r in valid]
    output_deltas = [r["delta"]["output_delta_pct"] for r in valid]
    raw_in_total = sum(r["delta"]["raw_input_tokens"] for r in valid)
    briefed_in_total = sum(r["delta"]["briefed_input_tokens"] for r in valid)
    raw_out_total = sum(r["delta"]["raw_output_tokens"] for r in valid)
    briefed_out_total = sum(r["delta"]["briefed_output_tokens"] for r in valid)

    # Rates are divided by 1_000_000 below — presumably USD per million
    # tokens; NOTE(review): confirm against current pricing.
    HAIKU_IN = 1.0
    HAIKU_OUT = 5.0
    raw_cost = (raw_in_total * HAIKU_IN + raw_out_total * HAIKU_OUT) / 1_000_000
    briefed_cost = (briefed_in_total * HAIKU_IN + briefed_out_total * HAIKU_OUT) / 1_000_000
    mean_red, ci_half = ci_95(reductions)
    mean_out_delta, _ = ci_95(output_deltas)

    summary = {
        "experiment": "005",
        "title": "Actual API Token Measurement",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "extraction_prompt": EXTRACTION_PROMPT,
        "n_documents_attempted": len(docs_meta),
        "n_skipped_not_in_pgvector": len(skipped),
        "n_valid_pairs": len(valid),
        "n_failed": len(docs_meta) - len(valid) - len(skipped),
        "total_elapsed_s": total_elapsed,
        "input_token_reduction": {
            "mean_pct": round(mean_red, 2),
            "ci_95_half_width_pct": round(ci_half, 2),
            "median_pct": round(statistics.median(reductions), 2) if reductions else None,
            "min_pct": round(min(reductions), 2) if reductions else None,
            "max_pct": round(max(reductions), 2) if reductions else None,
            "stdev_pct": round(statistics.stdev(reductions), 2) if len(reductions) > 1 else 0.0,
        },
        "output_token_delta": {"mean_pct": round(mean_out_delta, 2)},
        "totals": {
            "raw_input_tokens": raw_in_total,
            "briefed_input_tokens": briefed_in_total,
            "raw_output_tokens": raw_out_total,
            "briefed_output_tokens": briefed_out_total,
            "raw_cost_usd": round(raw_cost, 4),
            "briefed_cost_usd": round(briefed_cost, 4),
            "savings_usd": round(raw_cost - briefed_cost, 4),
        },
        "comparison_to_v2_estimate": {
            "v2_modeled_reduction_pct": 42.0,
            "measured_mean_reduction_pct": round(mean_red, 2),
            "delta_pct_points": round(mean_red - 42.0, 2),
        },
        "results": results,
    }
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(docs_meta)} valid pairs in {total_elapsed}s")
    if skipped:
        print(f"Skipped (not in pgvector): {len(skipped)}")
    print(f"Mean input token reduction: {mean_red:.2f}% +/- {ci_half:.2f}% (95% CI)")
    print(f"V2 modeled estimate: 42.0% | delta: {mean_red - 42.0:+.2f} pts")
    print(f"Mean output token delta: {mean_out_delta:+.2f}%")
    print(f"Total cost: ${raw_cost + briefed_cost:.4f}")
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()