add experiment scripts and results; watcher.py latest changes
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Audit Expansion Pack Generator — type-aware stratified draw of 12
|
||||
documents from base_class_validation_results.json for n=20 audit expansion.
|
||||
|
||||
Per audit-expansion-protocol.md amendment 2026-04-28:
|
||||
The seed=43 length-only random draw concentrated on course modules in the
|
||||
small and medium buckets, missing voice captures, syllabi, and
|
||||
conversational documents present in the candidate distribution.
|
||||
This script implements type-aware stratification within each length
|
||||
bucket to produce a sample representative of BirdAI's document-type mix.
|
||||
|
||||
Targets (12 total):
|
||||
small (4): 2 course_module + 2 voice_capture
|
||||
medium (4): 2 course_module + 1 syllabus + 1 other
|
||||
large (4): 1 course_ppt + 1 syllabus + 1 faculty_report + 1 conversational
|
||||
|
||||
Output: ~/aaronai/experiments/audit_expansion_pack.json
|
||||
|
||||
Usage:
|
||||
python3 ~/aaronai/scripts/audit_expansion_draw.py
|
||||
python3 ~/aaronai/scripts/audit_expansion_draw.py --dry-run
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Directory that holds both the inputs and the output of this script.
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
# Candidate pool: main() reads the "results" list from this file.
VALIDATION_RESULTS = EXPERIMENTS / "base_class_validation_results.json"
# Existing audit pack; sources listed in it are excluded from the new draw.
EXISTING_AUDIT_PACK = EXPERIMENTS / "base_class_audit_pack.json"
# Destination written by main() unless --dry-run is given.
OUTPUT_FILE = EXPERIMENTS / "audit_expansion_pack.json"

# Seed for the random draw, so the same candidates give the same sample.
SEED = 43

# Type-aware targets per bucket: size bucket -> {document type: number to draw}.
# Documents whose size bucket is not a key here are dropped by main().
TYPE_TARGETS = {
    "small": {"course_module": 2, "voice_capture": 2},
    "medium": {"course_module": 2, "syllabus": 1, "other": 1},
    "large": {"course_ppt": 1, "syllabus": 1, "faculty_report": 1, "conversational": 1},
}
|
||||
|
||||
|
||||
def classify(source, bucket):
    """Map a source filename to a document-type label.

    Rules are tried in order and the first match wins, so a name that
    satisfies several rules gets the earliest label.

    Args:
        source: Source name of the document. A missing, empty or non-string
            value is classified as ``"other"`` instead of raising.
        bucket: Size bucket of the document. It is only consulted by the
            ``course_ppt`` rule, which applies to the ``"large"`` bucket.

    Returns:
        One of ``"voice_capture"``, ``"conversational"``, ``"syllabus"``,
        ``"faculty_report"``, ``"course_ppt"``, ``"course_module"`` or
        ``"other"``.
    """
    # Records without a usable source cannot be pattern-matched; treat them
    # as unclassified rather than crashing on `.lower()`.
    if not isinstance(source, str) or not source:
        return "other"

    s = source.lower()

    # Voice captures — pattern: YYYY-MM-DD-HH-MM-voice.md
    if re.match(r"\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-voice\.md$", source):
        return "voice_capture"

    # Conversational exports — pattern: "Claude: ..." or "ChatGPT: ..."
    # (case-sensitive prefix match on the original name).
    if source.startswith(("Claude:", "ChatGPT:")):
        return "conversational"

    # Syllabus — must contain "syllabus" in the name
    if "syllabus" in s:
        return "syllabus"

    # Faculty / annual reports
    if "faculty report" in s or "annual report" in s:
        return "faculty_report"

    # Course PPTs (large bucket only) — ".pptx" or "_ppt_" anywhere in the
    # name, or a leading "modN_" prefix.
    if bucket == "large" and (".pptx" in s or "_ppt_" in s or re.match(r"mod\d+_", s)):
        return "course_ppt"

    # Course modules — two-digit numeric prefix followed by an underscore.
    if re.match(r"^\d{2}_", source):
        return "course_module"

    # Everything else falls into 'other' for medium; not used in small/large targets
    return "other"
|
||||
|
||||
|
||||
def _load_excluded_sources(pack_path):
    """Return the set of ``source`` values listed in an existing audit pack.

    The pack may be a dict holding its entries under ``"pairs"`` (or
    ``"results"``), or a bare top-level list of entries. Entries that are
    not dicts or that have no ``source`` are ignored.
    """
    with open(pack_path, encoding="utf-8") as f:
        existing = json.load(f)
    if isinstance(existing, dict):
        entries = existing.get("pairs", existing.get("results")) or []
    elif isinstance(existing, list):
        entries = existing
    else:
        entries = []
    return {
        entry["source"]
        for entry in entries
        if isinstance(entry, dict) and entry.get("source")
    }


def main():
    """Draw the type-aware stratified sample and write the expansion pack.

    Steps: load the validation results, drop sources already in the existing
    audit pack and documents lacking either condition or a known size
    bucket, classify each remaining document, report availability per
    (bucket, type), draw up to the target count for every (bucket, type)
    pair with a fixed seed, and write the result to OUTPUT_FILE unless
    ``--dry-run`` is given. Shortfalls are reported on stderr and recorded
    in the output metadata.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not VALIDATION_RESULTS.exists():
        print(f"ERROR: {VALIDATION_RESULTS} not found", file=sys.stderr)
        sys.exit(1)
    with open(VALIDATION_RESULTS, encoding="utf-8") as f:
        validation = json.load(f)

    all_docs = validation["results"]
    print(f"Loaded {len(all_docs)} documents from validation results")
    print(f"Experiment: {validation.get('title', 'unknown')}")

    # Load existing audit pack to exclude its sources (audit pack uses 'pairs')
    excluded_sources = set()
    if EXISTING_AUDIT_PACK.exists():
        excluded_sources = _load_excluded_sources(EXISTING_AUDIT_PACK)
        print(f"Excluding {len(excluded_sources)} sources already in audit pack")

    # Filter to valid candidates
    valid_docs = []
    for doc in all_docs:
        src = doc.get("source")
        if src in excluded_sources:
            continue
        if not doc.get("condition_a") or not doc.get("condition_b"):
            continue
        bucket = doc.get("size_bucket")
        if bucket not in TYPE_TARGETS:
            continue
        # The type label is stored on the record, so it is also written to
        # the output file alongside the original fields.
        doc["_type"] = classify(src, bucket)
        valid_docs.append(doc)

    print(f"Valid candidate documents: {len(valid_docs)}")

    # Print what's available per (bucket, type) before drawing
    print("\nCandidates by (bucket, type):")
    for bucket in TYPE_TARGETS:
        available = {}
        for d in valid_docs:
            if d["size_bucket"] == bucket:
                available[d["_type"]] = available.get(d["_type"], 0) + 1
        print(f" {bucket}:")
        for t in sorted(available):
            target = TYPE_TARGETS[bucket].get(t, "—")
            print(f" {t:>16}: {available[t]} avail, target {target}")

    # Stratified type-aware draw. A private generator seeded with SEED yields
    # the same sequence as seeding the module-level generator, without
    # touching global random state.
    rng = random.Random(SEED)
    drawn = []
    warnings = []
    for bucket, type_targets in TYPE_TARGETS.items():
        bucket_docs = [d for d in valid_docs if d["size_bucket"] == bucket]
        for doc_type, target in type_targets.items():
            type_docs = [d for d in bucket_docs if d["_type"] == doc_type]
            if len(type_docs) < target:
                msg = (f"WARNING: bucket={bucket} type={doc_type} "
                       f"available={len(type_docs)} target={target}")
                warnings.append(msg)
                print(msg, file=sys.stderr)
            # Draw what is available when the pool is short of the target.
            n_to_draw = min(target, len(type_docs))
            drawn.extend(rng.sample(type_docs, n_to_draw))

    # Report draw
    print(f"\nDrew {len(drawn)} documents:")
    for d in drawn:
        src = d.get("source", "<unknown>")
        chars = d.get("doc_chars_original", 0)
        bucket = d.get("size_bucket", "?")
        doc_type = d.get("_type", "?")
        truncated = " (TRUNCATED)" if d.get("truncated") else ""
        print(f" [{bucket:>6}/{doc_type:>16}] {chars:>6}c {src}{truncated}")

    # Bucket-level summary
    bucket_counts = {"small": 0, "medium": 0, "large": 0}
    for d in drawn:
        bucket_counts[d["size_bucket"]] += 1
    print(f"\nBucket totals: {bucket_counts}")

    if args.dry_run:
        print("\n--dry-run set, not writing output file")
        return

    output = {
        "metadata": {
            "generated_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "source_validation_file": str(VALIDATION_RESULTS),
            "seed": SEED,
            "stratification": "type-aware within length bucket",
            "type_targets": TYPE_TARGETS,
            "bucket_counts": bucket_counts,
            "excluded_count": len(excluded_sources),
            "warnings": warnings,
            "purpose": "n=20 audit expansion per audit-expansion-protocol.md (type-aware amendment)",
        },
        "results": drawn,
    }
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, default=str)
    print(f"\nWrote {OUTPUT_FILE}")
    print(f" {len(drawn)} documents ready for rating")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,605 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base-Class Enrichment Test — OOP Framing Experiment
|
||||
|
||||
Tests whether non-entity metadata from a local model (domain class, structural
|
||||
signals, presence flags, length, summary) can take load off the API without
|
||||
constraining what it extracts.
|
||||
|
||||
The local model does NOT draft entities. The API still does full extraction.
|
||||
The local model produces metadata that orients the API's reading.
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction, no metadata
|
||||
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
|
||||
|
||||
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
|
||||
If B produces fewer edges or less predicate diversity, metadata is acting as
|
||||
constraint and the OOP framing is falsified.
|
||||
|
||||
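Sample (original design; in this audit re-run variant stratify() instead loads the sources listed in base_class_audit_pack.json): 50 docs from briefing_test_v2_results.json: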
|
||||
- 15 small (<1000 chars)
|
||||
- 25 medium (1000-5000 chars)
|
||||
- 10 large (5000-12000 chars, capped at 12K)
|
||||
|
||||
Outputs: ~/aaronai/experiments/base_class_audit_rerun_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables (main() reads ANTHROPIC_API_KEY and PG_DSN) from the project .env.
load_dotenv(Path.home() / "aaronai" / ".env")

# Input results file; main() keeps its "documents" entries whose status is SUCCESS.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Destination of the JSON summary written at the end of main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_audit_rerun_results.json"
# Parameters passed to client.messages.create() by call_haiku().
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0
# Endpoint and model used by call_local_metadata() for the local metadata pass.
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
# Passed as the `timeout` argument of requests.post (presumably seconds — confirm).
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates the joined document text to this many characters.
MAX_DOC_CHARS = 12000

# Prices used for the cost totals: main() multiplies token counts by these
# and divides by 1_000_000, reporting the result as USD.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"language": "en or other",
|
||||
"char_length": integer,
|
||||
"primary_format": "prose, presentation, list, form, code, or mixed",
|
||||
"structural_signals": {
|
||||
"has_headings": boolean,
|
||||
"has_bullet_lists": boolean,
|
||||
"has_numbered_lists": boolean,
|
||||
"has_tables": boolean,
|
||||
"has_code_blocks": boolean,
|
||||
"has_dates": boolean
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": boolean,
|
||||
"has_institutional_language": boolean,
|
||||
"has_technical_terminology": boolean,
|
||||
"has_first_person": boolean,
|
||||
"has_quotations": boolean
|
||||
},
|
||||
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
|
||||
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
|
||||
}
|
||||
|
||||
JSON only, no commentary.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
|
||||
|
||||
DOCUMENT METADATA:
|
||||
{metadata_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Remove a surrounding Markdown code fence from *text*.

    A leading ``` or ```json marker and a trailing ``` marker are dropped,
    along with the whitespace around them. Falsy input yields "".
    """
    if text:
        unfenced = re.sub(r"^```(?:json)?\s*", "", text.strip())
        return re.sub(r"\s*```$", "", unfenced).strip()
    return ""
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Fetch and join all stored chunks of one source document.

    Args:
        pg_conn: Open database connection providing ``cursor()``.
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        ``(text, original_length)`` where ``text`` is the chunks joined by a
        blank line (in ``id`` order) and truncated to MAX_DOC_CHARS, and
        ``original_length`` is the length before truncation. Returns
        ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarize the reply.

    Returns a dict with the reported input/output token counts, the
    wall-clock latency in seconds (rounded to 2 places), the text of the
    first content block ("" when the reply has no content) and the stop
    reason. Exceptions raised by the client propagate to the caller.
    """
    started = time.time()
    request = {
        "model": HAIKU_MODEL,
        "max_tokens": HAIKU_MAX_TOKENS,
        "temperature": HAIKU_TEMPERATURE,
        "messages": [{"role": "user", "content": prompt_text}],
    }
    reply = client.messages.create(**request)
    usage = reply.usage
    return {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
        "response_text": reply.content[0].text if reply.content else "",
        "stop_reason": reply.stop_reason,
    }
|
||||
|
||||
|
||||
def call_local_metadata(document_text):
    """Ask the local model for surface metadata about *document_text*.

    Returns ``{"response": <raw text>, "latency_s": ...}`` on success. Any
    exception (building the request, transport, HTTP status, body decoding)
    is caught and reported as ``{"error": <message>, "latency_s": ...}``, so
    this function does not raise for a failed call.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        raw_text = reply.json().get("response", "")
    except Exception as exc:
        # Deliberate best-effort: the caller records the failure and skips condition B.
        return {"error": str(exc), "latency_s": elapsed()}
    return {"response": raw_text, "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply is accepted only when, after fence stripping, it decodes to a
    JSON object whose "entities" and "edges" values are both lists. In every
    other case the result is ``(None, None, False)``.
    """
    failure = (None, None, False)

    text = strip_json_fences(raw)
    if not text:
        return failure

    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return failure

    if isinstance(payload, dict):
        entities = payload.get("entities")
        edges = payload.get("edges")
        if isinstance(entities, list) and isinstance(edges, list):
            return entities, edges, True
    return failure
|
||||
|
||||
|
||||
def parse_metadata(raw):
    """Parse the local model's reply into a metadata dict.

    Returns the decoded JSON object, or ``None`` when the reply is empty, is
    not valid JSON, or decodes to something other than an object. Callers
    treat ``None`` as a parse failure, so a bare list or string is rejected
    here rather than passed on as metadata.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
def graph_metrics(entities, edges):
    """Compute summary metrics for an extracted graph.

    Inputs are the lists returned by parse_graph_full; if either is None the
    result is None. Names, types and predicates are compared after
    ``str().strip().lower()``. Non-dict items are counted in the totals but
    contribute nothing to the diversity or connectivity figures. An edge
    joins two nodes only when both endpoints are known entity names.
    """
    if entities is None or edges is None:
        return None

    def distinct(items, key):
        # Normalized, truthy values of `key` across the dict items.
        return {
            str(item.get(key)).strip().lower()
            for item in items
            if isinstance(item, dict) and item.get(key)
        }

    entity_count = len(entities)
    edge_count = len(edges)

    # Average degree: each edge touches two nodes.
    mean_degree = 2 * edge_count / entity_count if entity_count > 0 else 0.0

    # Undirected adjacency restricted to known entity names.
    neighbors = {name: set() for name in distinct(entities, "name")}
    for edge in edges:
        if isinstance(edge, dict):
            src = str(edge.get("subject", "")).strip().lower()
            dst = str(edge.get("object", "")).strip().lower()
            if src in neighbors and dst in neighbors:
                neighbors[src].add(dst)
                neighbors[dst].add(src)

    # Size of the largest connected component (stack-based traversal).
    seen = set()
    biggest = 0
    for root in neighbors:
        if root in seen:
            continue
        size = 0
        pending = [root]
        while pending:
            current = pending.pop()
            if current in seen:
                continue
            seen.add(current)
            size += 1
            pending.extend(n for n in neighbors[current] if n not in seen)
        biggest = max(biggest, size)

    return {
        "n_entities": entity_count,
        "n_edges": edge_count,
        "predicate_diversity": len(distinct(edges, "predicate")),
        "type_diversity": len(distinct(entities, "type")),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": round(100 * biggest / entity_count, 1) if entity_count else 0.0,
    }
|
||||
|
||||
|
||||
def stratify(docs):
    """Return the audit re-run sample from base_class_audit_pack.json.

    Despite its name, this variant does no stratification: it builds one
    synthetic doc-meta entry per entry in the audit pack's "pairs" list.

    Args:
        docs: Accepted for interface compatibility with the caller; unused.

    Returns:
        A list of ``{"source", "content_length", "status"}`` dicts, or an
        empty list when the audit pack file does not exist.
    """
    audit_file = Path.home() / "aaronai" / "experiments" / "base_class_audit_pack.json"
    if not audit_file.exists():
        # Errors go to stderr, matching the other error messages in this file.
        print(f"ERROR: {audit_file} not found", file=sys.stderr)
        return []
    audit = json.loads(audit_file.read_text(encoding="utf-8"))

    # Synthesize doc_meta entries for the audit sources
    sample = [
        {"source": pair["source"], "content_length": 0, "status": "SUCCESS"}
        for pair in audit["pairs"]
    ]
    print(f"Audit re-run: {len(sample)} docs from base_class_audit_pack.json")
    return sample
|
||||
|
||||
|
||||
def fmt_metrics(m):
    """Render a graph_metrics dict as a one-line summary, or "n/a" for None."""
    if m is None:
        return "n/a"
    entity_count = m['n_entities']
    fields = (
        f"e={entity_count}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{entity_count}",
    )
    return " ".join(fields)
|
||||
|
||||
|
||||
def _condition_a_record(a, a_metrics):
    """Build the stored condition-A entry from a call_haiku result or error dict."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def _avg_metric(rows, condition, key):
    """Mean of one metric over rows that have metrics for *condition*, rounded to 2 places."""
    vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
    return round(statistics.mean(vals), 2) if vals else None


def _fmt_pct(value):
    """Format a percentage delta with sign; "n/a" when no delta could be computed."""
    return "n/a" if value is None else f"{value:+.1f}%"


def _run_document(client, pg_conn, source, tag):
    """Run conditions A and B for one source and return its result record.

    *tag* is the "[i/n]" progress prefix used in the console output.
    """
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"{tag} {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    trunc_marker = "*" if truncated else " "
    print(f"{tag} [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A
    try:
        a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
        a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
        a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
        print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
              f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
        print(f" {fmt_metrics(a_metrics)}", flush=True)
    except Exception as e:
        # Best-effort: record the failure and still attempt condition B.
        print(f" A FAILED: {e}", flush=True)
        a = {"error": str(e)}
        a_metrics = None

    def record(condition_b):
        # Every outcome shares the same document fields and condition-A entry.
        return {
            "source": source,
            "size_bucket": size_bucket,
            "doc_chars_original": original_len,
            "doc_chars_sent": sent_len,
            "truncated": truncated,
            "condition_a": _condition_a_record(a, a_metrics),
            "condition_b": condition_b,
        }

    # Condition B local metadata pass
    local_result = call_local_metadata(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']}", flush=True)
        return record({
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        })

    local_raw = local_result["response"]
    metadata = parse_metadata(local_raw)
    # Override LLM-hallucinated char_length with Python-computed truth
    if isinstance(metadata, dict):
        metadata["char_length"] = len(doc_text)
    print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
          flush=True)

    if metadata is None:
        print(f" B: metadata parse failed — skipping API call", flush=True)
        return record({
            "skipped": "metadata_parse_failed",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        })

    metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
    # str.replace rather than str.format: the prompt contains literal JSON braces.
    b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text

    try:
        b = call_haiku(client, b_prompt)
        b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
        b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
        print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
              f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
        print(f" {fmt_metrics(b_metrics)}", flush=True)
    except Exception as e:
        print(f" B api FAILED: {e}", flush=True)
        b = {"error": str(e)}
        b_metrics = None

    # Per-doc deltas (only when both API calls returned token counts)
    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        pred_pct_str = "n/a"
        if a_metrics and b_metrics:
            if a_metrics["n_edges"] > 0:
                edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
            if a_metrics["predicate_diversity"] > 0:
                pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
        print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
              flush=True)

    return record({
        "local_latency_s": local_result.get("latency_s"),
        "local_metadata": metadata,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "metrics": b_metrics,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:32000],
        "error": b.get("error"),
    })


def main():
    """Run the A/B extraction experiment over the sample and write the summary.

    Requires ANTHROPIC_API_KEY and PG_DSN in the environment and V2_FILE on
    disk; exits with status 1 otherwise. For every sampled source it runs
    condition A (plain extraction) and condition B (local metadata followed
    by extraction with that metadata), then aggregates token, cost and graph
    metrics over the documents where both conditions produced a parseable
    graph. The summary and all per-document records go to OUTPUT_FILE.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print(f"Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            tag = f"[{i:02d}/{len(sample)}]"
            results.append(_run_document(client, pg_conn, doc_meta["source"], tag))
    finally:
        # Close the connection even if a document fails unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # A pair is valid only when both conditions produced a parseable graph.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": _avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": _avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": _avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": _avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": _avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": _avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": _avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": _avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": _avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": _avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": _avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": _avg_metric(rows, "condition_b", "largest_component_pct"),
        }

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # Deltas are None when the baseline token sum is zero.
            print(f" cost: in {_fmt_pct(stats['input_delta_pct'])} out {_fmt_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,593 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base-Class Enrichment Test — OOP Framing Experiment
|
||||
|
||||
Tests whether non-entity metadata from a local model (domain class, structural
|
||||
signals, presence flags, length, summary) can take load off the API without
|
||||
constraining what it extracts.
|
||||
|
||||
The local model does NOT draft entities. The API still does full extraction.
|
||||
The local model produces metadata that orients the API's reading.
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction, no metadata
|
||||
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
|
||||
|
||||
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
|
||||
If B produces fewer edges or less predicate diversity, metadata is acting as
|
||||
constraint and the OOP framing is falsified.
|
||||
|
||||
Sample: 20 docs from briefing_test_v2_results.json:
|
||||
- 5 small (<1000 chars)
|
||||
- 10 medium (1000-5000 chars)
|
||||
- 5 large (5000-12000 chars, capped at 12K)
|
||||
|
||||
Outputs: ~/aaronai/experiments/base_class_test_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_test_results.json"
|
||||
HAIKU_MODEL = "claude-haiku-4-5-20251001"
|
||||
HAIKU_MAX_TOKENS = 4096
|
||||
HAIKU_TEMPERATURE = 0.0
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
LOCAL_MODEL = "mistral"
|
||||
LOCAL_TIMEOUT = 180
|
||||
MAX_DOC_CHARS = 12000
|
||||
|
||||
HAIKU_IN_PER_M = 1.0
|
||||
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"language": "en or other",
|
||||
"char_length": integer,
|
||||
"primary_format": "prose, presentation, list, form, code, or mixed",
|
||||
"structural_signals": {
|
||||
"has_headings": boolean,
|
||||
"has_bullet_lists": boolean,
|
||||
"has_numbered_lists": boolean,
|
||||
"has_tables": boolean,
|
||||
"has_code_blocks": boolean,
|
||||
"has_dates": boolean
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": boolean,
|
||||
"has_institutional_language": boolean,
|
||||
"has_technical_terminology": boolean,
|
||||
"has_first_person": boolean,
|
||||
"has_quotations": boolean
|
||||
},
|
||||
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
|
||||
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
|
||||
}
|
||||
|
||||
JSON only, no commentary.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
|
||||
|
||||
DOCUMENT METADATA:
|
||||
{metadata_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* with a surrounding Markdown code fence removed.

    Models sometimes wrap their JSON in a fenced block even when told not to.
    The opening fence (optionally tagged ``json`` in any letter case) and the
    closing fence are dropped together with surrounding whitespace.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    t = text.strip()
    # IGNORECASE: a "```JSON" tag must be removed as well, otherwise the
    # leftover "JSON" prefix makes the payload unparseable.
    t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE)
    t = re.sub(r"\s*```$", "", t)
    return t.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its stored chunks.

    Reads the ``document`` column of every ``embeddings`` row whose ``source``
    matches, ordered by ``id``, and joins the chunks with blank lines.

    Returns:
        ``(text, full_length)`` where *text* is the joined document cut to
        ``MAX_DOC_CHARS`` and *full_length* is its length before the cut, or
        ``(None, 0)`` when no text is stored for *source*.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query fails.
        cur.close()

    # NULL chunks would make str.join raise; treat them as absent.
    chunks = [row[0] for row in rows if row[0] is not None]
    if not chunks:
        return None, 0
    full = "\n\n".join(chunks)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send a single user-turn prompt to Haiku and summarize the reply.

    The request uses the module-level HAIKU_MODEL, HAIKU_MAX_TOKENS and
    HAIKU_TEMPERATURE settings. Exceptions raised by the client propagate.

    Returns a dict with ``input_tokens``, ``output_tokens``, ``latency_s``
    (seconds, rounded to 2 decimals), ``response_text`` (text of the first
    content block, or ``""`` when the reply has no content) and
    ``stop_reason``.
    """
    began = time.time()
    request = {
        "model": HAIKU_MODEL,
        "max_tokens": HAIKU_MAX_TOKENS,
        "temperature": HAIKU_TEMPERATURE,
        "messages": [{"role": "user", "content": prompt_text}],
    }
    reply = client.messages.create(**request)

    summary = {
        "input_tokens": reply.usage.input_tokens,
        "output_tokens": reply.usage.output_tokens,
        "latency_s": round(time.time() - began, 2),
    }
    summary["response_text"] = reply.content[0].text if reply.content else ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface metadata about a document.

    POSTs LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL,
    requesting a non-streamed JSON-formatted completion.

    Returns ``{"response": <raw model text>, "latency_s": ...}`` on success.
    Any exception is reported, not raised, as
    ``{"error": <message>, "latency_s": ...}``.
    """
    began = time.time()

    def elapsed():
        return round(time.time() - began, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as exc:
        # Best effort by design: the caller records the failure and moves on.
        return {"error": str(exc), "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply must be a JSON object (optionally fenced) whose ``entities``
    and ``edges`` values are both lists. Anything else, including empty
    input or invalid JSON, yields ``(None, None, False)``.
    """
    failure = (None, None, False)

    payload = strip_json_fences(raw)
    if payload:
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            return failure
        if isinstance(parsed, dict):
            entities = parsed.get("entities")
            edges = parsed.get("edges")
            if isinstance(entities, list) and isinstance(edges, list):
                return entities, edges, True
    return failure
|
||||
|
||||
|
||||
def parse_metadata(raw):
    """Parse the local model's reply into a metadata dict.

    Returns the decoded JSON object, or ``None`` when the reply is empty,
    is not valid JSON, or decodes to something other than an object. The
    metadata schema is an object, so a bare list or scalar counts as a
    parse failure, matching how ``parse_graph_full`` treats non-objects.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    *entities* and *edges* are the lists returned by ``parse_graph_full``;
    if either is ``None`` the result is ``None``.

    Counts (``n_entities``, ``n_edges``) are raw list lengths. Diversity
    figures count distinct stripped, lowercased ``predicate`` / ``type``
    values on dict items. Connectivity is computed over distinct normalized
    entity names, and an edge links two nodes only when both endpoints are
    known names.
    """
    if entities is None or edges is None:
        return None

    def normalized(items, field):
        # Distinct normalized values of a truthy field across dict items.
        return {
            str(item.get(field)).strip().lower()
            for item in items
            if isinstance(item, dict) and item.get(field)
        }

    entity_total = len(entities)
    edge_total = len(edges)
    predicate_labels = normalized(edges, "predicate")
    type_labels = normalized(entities, "type")
    node_names = normalized(entities, "name")

    # Union-find over entity names: linked nodes end up sharing a root.
    parent = {name: name for name in node_names}

    def root_of(name):
        while parent[name] != name:
            parent[name] = parent[parent[name]]
            name = parent[name]
        return name

    for edge in edges:
        if isinstance(edge, dict):
            subj = str(edge.get("subject", "")).strip().lower()
            obj = str(edge.get("object", "")).strip().lower()
            if subj in parent and obj in parent:
                parent[root_of(subj)] = root_of(obj)

    sizes = {}
    for name in node_names:
        root = root_of(name)
        sizes[root] = sizes.get(root, 0) + 1
    biggest = max(sizes.values(), default=0)

    # Each edge touches two nodes, hence the factor of two.
    mean_degree = (2 * edge_total / entity_total) if entity_total > 0 else 0.0
    biggest_pct = round(100 * biggest / entity_total, 1) if entity_total else 0.0

    return {
        "n_entities": entity_total,
        "n_edges": edge_total,
        "predicate_diversity": len(predicate_labels),
        "type_diversity": len(type_labels),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
|
||||
|
||||
|
||||
def stratify(docs, *, n_small=5, n_medium=10, n_large=5):
    """Pick a length-stratified sample from *docs*, preserving file order.

    Each doc must carry an integer ``content_length``. Buckets are
    small (< 1000), medium (1000-4999) and large (>= 5000); the first
    ``n_small`` / ``n_medium`` / ``n_large`` docs of each bucket are taken.
    The defaults reproduce the 5/10/5 design of this experiment.

    Returns the selected docs as one list: small, then medium, then large.
    """
    small = [d for d in docs if d["content_length"] < 1000]
    medium = [d for d in docs if 1000 <= d["content_length"] < 5000]
    large = [d for d in docs if d["content_length"] >= 5000]
    return small[:n_small] + medium[:n_medium] + large[:n_large]
|
||||
|
||||
|
||||
def fmt_metrics(m):
    """Render a ``graph_metrics`` dict as a one-line summary (``"n/a"`` for None)."""
    if m is None:
        return "n/a"
    fields = (
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    )
    return " ".join(fields)
|
||||
|
||||
|
||||
def _result_record(doc_fields, a, a_metrics, condition_b):
    """Assemble one per-document entry for the results list.

    Condition A is serialized the same way on every path (B succeeded, local
    model failed, metadata unparseable), so it is built in one place.
    *a* is either a ``call_haiku`` result or ``{"error": ...}``.
    """
    return {
        **doc_fields,
        "condition_a": {
            "input_tokens": a.get("input_tokens"),
            "output_tokens": a.get("output_tokens"),
            "latency_s": a.get("latency_s"),
            "metrics": a_metrics,
            "stop_reason": a.get("stop_reason"),
            "response_text": a.get("response_text", "")[:4000],
            "error": a.get("error"),
        },
        "condition_b": condition_b,
    }


def _signed_pct(value):
    """Render a percentage delta with an explicit sign, or "n/a" when undefined."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the A/B extraction experiment and write the JSON summary.

    For each sampled document: run condition A (Haiku alone), obtain local
    metadata, run condition B (Haiku with metadata), and record tokens,
    latency and graph metrics. Documents where either condition lacks
    metrics are kept in ``results`` but excluded from the aggregates.

    Exits with status 1 when ANTHROPIC_API_KEY / PG_DSN are unset or the v2
    results file is missing.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (5s/10m/5l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket by what is actually sent, i.e. after the MAX_DOC_CHARS cut.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

            doc_fields = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
            }

            # Condition A: baseline extraction, no metadata.
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
                a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
                print(f" {fmt_metrics(a_metrics)}", flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_metrics = None

            # Condition B, step 1: local metadata pass.
            local_result = call_local_metadata(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']}", flush=True)
                results.append(_result_record(doc_fields, a, a_metrics, {
                    "skipped": "local_model_failed",
                    "local_error": local_result["error"],
                    "local_latency_s": local_result.get("latency_s"),
                }))
                continue

            local_raw = local_result["response"]
            metadata = parse_metadata(local_raw)
            print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
                  flush=True)

            if metadata is None:
                print(" B: metadata parse failed — skipping API call", flush=True)
                results.append(_result_record(doc_fields, a, a_metrics, {
                    "skipped": "metadata_parse_failed",
                    "local_latency_s": local_result.get("latency_s"),
                    "local_raw": local_raw[:1000],
                }))
                continue

            # str.replace rather than str.format: the prompt contains literal
            # JSON braces that format() would try to interpret.
            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
            b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text

            # Condition B, step 2: extraction with metadata as orienting frame.
            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
                b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
                print(f" {fmt_metrics(b_metrics)}", flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_metrics = None

            # Per-doc deltas (only when both API calls returned usage).
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                pred_pct_str = "n/a"
                if a_metrics and b_metrics:
                    if a_metrics["n_edges"] > 0:
                        edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                    if a_metrics["predicate_diversity"] > 0:
                        pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
                print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                      flush=True)

            results.append(_result_record(doc_fields, a, a_metrics, {
                "local_latency_s": local_result.get("latency_s"),
                "local_metadata": metadata,
                "local_raw": local_raw[:1000],
                "api_input_tokens": b.get("input_tokens"),
                "api_output_tokens": b.get("output_tokens"),
                "api_latency_s": b.get("latency_s"),
                "metrics": b_metrics,
                "stop_reason": b.get("stop_reason"),
                "response_text": b.get("response_text", "")[:4000],
                "error": b.get("error"),
            }))
    finally:
        # Release the connection even if a document raises unexpectedly.
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # Only documents with metrics for BOTH conditions enter the aggregates.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # Deltas are None when the baseline token sum is zero.
            print(f" cost: in {_signed_pct(stats['input_delta_pct'])} out {_signed_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,611 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base-Class Enrichment Test — OOP Framing Experiment
|
||||
|
||||
Tests whether non-entity metadata from a local model (domain class, structural
|
||||
signals, presence flags, length, summary) can take load off the API without
|
||||
constraining what it extracts.
|
||||
|
||||
The local model does NOT draft entities. The API still does full extraction.
|
||||
The local model produces metadata that orients the API's reading.
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction, no metadata
|
||||
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
|
||||
|
||||
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
|
||||
If B produces fewer edges or less predicate diversity, metadata is acting as
|
||||
constraint and the OOP framing is falsified.
|
||||
|
||||
Sample: 50 docs from briefing_test_v2_results.json:
|
||||
- 15 small (<1000 chars)
|
||||
- 25 medium (1000-5000 chars)
|
||||
- 10 large (5000-12000 chars, capped at 12K) — sources listed in ~/aaronai/large_bucket_sources.json, since v2 has no large docs
|
||||
|
||||
Outputs: ~/aaronai/experiments/base_class_validation_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_validation_results.json"
|
||||
HAIKU_MODEL = "claude-haiku-4-5-20251001"
|
||||
HAIKU_MAX_TOKENS = 8192
|
||||
HAIKU_TEMPERATURE = 0.0
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
LOCAL_MODEL = "mistral"
|
||||
LOCAL_TIMEOUT = 180
|
||||
MAX_DOC_CHARS = 12000
|
||||
|
||||
HAIKU_IN_PER_M = 1.0
|
||||
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"language": "en or other",
|
||||
"char_length": integer,
|
||||
"primary_format": "prose, presentation, list, form, code, or mixed",
|
||||
"structural_signals": {
|
||||
"has_headings": boolean,
|
||||
"has_bullet_lists": boolean,
|
||||
"has_numbered_lists": boolean,
|
||||
"has_tables": boolean,
|
||||
"has_code_blocks": boolean,
|
||||
"has_dates": boolean
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": boolean,
|
||||
"has_institutional_language": boolean,
|
||||
"has_technical_terminology": boolean,
|
||||
"has_first_person": boolean,
|
||||
"has_quotations": boolean
|
||||
},
|
||||
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
|
||||
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
|
||||
}
|
||||
|
||||
JSON only, no commentary.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
|
||||
|
||||
DOCUMENT METADATA:
|
||||
{metadata_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* with a surrounding Markdown code fence removed.

    Models sometimes wrap their JSON in a fenced block even when told not to.
    The opening fence (optionally tagged ``json`` in any letter case) and the
    closing fence are dropped together with surrounding whitespace.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    t = text.strip()
    # IGNORECASE: a "```JSON" tag must be removed as well, otherwise the
    # leftover "JSON" prefix makes the payload unparseable.
    t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE)
    t = re.sub(r"\s*```$", "", t)
    return t.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its stored chunks.

    Reads the ``document`` column of every ``embeddings`` row whose ``source``
    matches, ordered by ``id``, and joins the chunks with blank lines.

    Returns:
        ``(text, full_length)`` where *text* is the joined document cut to
        ``MAX_DOC_CHARS`` and *full_length* is its length before the cut, or
        ``(None, 0)`` when no text is stored for *source*.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query fails.
        cur.close()

    # NULL chunks would make str.join raise; treat them as absent.
    chunks = [row[0] for row in rows if row[0] is not None]
    if not chunks:
        return None, 0
    full = "\n\n".join(chunks)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send a single user-turn prompt to Haiku and summarize the reply.

    The request uses the module-level HAIKU_MODEL, HAIKU_MAX_TOKENS and
    HAIKU_TEMPERATURE settings. Exceptions raised by the client propagate.

    Returns a dict with ``input_tokens``, ``output_tokens``, ``latency_s``
    (seconds, rounded to 2 decimals), ``response_text`` (text of the first
    content block, or ``""`` when the reply has no content) and
    ``stop_reason``.
    """
    began = time.time()
    request = {
        "model": HAIKU_MODEL,
        "max_tokens": HAIKU_MAX_TOKENS,
        "temperature": HAIKU_TEMPERATURE,
        "messages": [{"role": "user", "content": prompt_text}],
    }
    reply = client.messages.create(**request)

    summary = {
        "input_tokens": reply.usage.input_tokens,
        "output_tokens": reply.usage.output_tokens,
        "latency_s": round(time.time() - began, 2),
    }
    summary["response_text"] = reply.content[0].text if reply.content else ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface metadata about a document.

    POSTs LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL,
    requesting a non-streamed JSON-formatted completion.

    Returns ``{"response": <raw model text>, "latency_s": ...}`` on success.
    Any exception is reported, not raised, as
    ``{"error": <message>, "latency_s": ...}``.
    """
    began = time.time()

    def elapsed():
        return round(time.time() - began, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as exc:
        # Best effort by design: the caller records the failure and moves on.
        return {"error": str(exc), "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply must be a JSON object (optionally fenced) whose ``entities``
    and ``edges`` values are both lists. Anything else, including empty
    input or invalid JSON, yields ``(None, None, False)``.
    """
    failure = (None, None, False)

    payload = strip_json_fences(raw)
    if payload:
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            return failure
        if isinstance(parsed, dict):
            entities = parsed.get("entities")
            edges = parsed.get("edges")
            if isinstance(entities, list) and isinstance(edges, list):
                return entities, edges, True
    return failure
|
||||
|
||||
|
||||
def parse_metadata(raw):
    """Parse the local model's reply into a metadata dict.

    Returns the decoded JSON object, or ``None`` when the reply is empty,
    is not valid JSON, or decodes to something other than an object. The
    metadata schema is an object, so a bare list or scalar counts as a
    parse failure, matching how ``parse_graph_full`` treats non-objects.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None
|
||||
|
||||
|
||||
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    *entities* and *edges* are the lists returned by ``parse_graph_full``;
    if either is ``None`` the result is ``None``.

    Counts (``n_entities``, ``n_edges``) are raw list lengths. Diversity
    figures count distinct stripped, lowercased ``predicate`` / ``type``
    values on dict items. Connectivity is computed over distinct normalized
    entity names, and an edge links two nodes only when both endpoints are
    known names.
    """
    if entities is None or edges is None:
        return None

    def normalized(items, field):
        # Distinct normalized values of a truthy field across dict items.
        return {
            str(item.get(field)).strip().lower()
            for item in items
            if isinstance(item, dict) and item.get(field)
        }

    entity_total = len(entities)
    edge_total = len(edges)
    predicate_labels = normalized(edges, "predicate")
    type_labels = normalized(entities, "type")
    node_names = normalized(entities, "name")

    # Union-find over entity names: linked nodes end up sharing a root.
    parent = {name: name for name in node_names}

    def root_of(name):
        while parent[name] != name:
            parent[name] = parent[parent[name]]
            name = parent[name]
        return name

    for edge in edges:
        if isinstance(edge, dict):
            subj = str(edge.get("subject", "")).strip().lower()
            obj = str(edge.get("object", "")).strip().lower()
            if subj in parent and obj in parent:
                parent[root_of(subj)] = root_of(obj)

    sizes = {}
    for name in node_names:
        root = root_of(name)
        sizes[root] = sizes.get(root, 0) + 1
    biggest = max(sizes.values(), default=0)

    # Each edge touches two nodes, hence the factor of two.
    mean_degree = (2 * edge_total / entity_total) if entity_total > 0 else 0.0
    biggest_pct = round(100 * biggest / entity_total, 1) if entity_total else 0.0

    return {
        "n_entities": entity_total,
        "n_edges": edge_total,
        "predicate_diversity": len(predicate_labels),
        "type_diversity": len(type_labels),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
|
||||
|
||||
|
||||
def stratify(docs):
    """Build the evaluation sample: small + medium docs from v2, large from a side file.

    Small (< 1000 chars) and medium (1000-4999 chars) documents are taken from
    ``docs`` in the order given, capped at 15 and 25 respectively. The large
    bucket is loaded separately from ``~/aaronai/large_bucket_sources.json``
    (sampled fresh from pgvector since v2 has no large docs).

    Args:
        docs: v2 document records; each must carry a ``content_length`` key.

    Returns:
        list[dict]: small + medium + large records, in that order. Large
        records are synthesized as
        ``{"source": <name>, "content_length": 0, "status": "SUCCESS"}``.
    """
    small = [d for d in docs if d["content_length"] < 1000][:15]
    medium = [d for d in docs if 1000 <= d["content_length"] < 5000][:25]

    # Load large bucket from external sources file.
    # NOTE(review): the file is presumably a JSON array of source names — confirm.
    large_sources_file = Path.home() / "aaronai" / "large_bucket_sources.json"
    if large_sources_file.exists():
        large_source_names = json.loads(large_sources_file.read_text())
        # Synthesize doc_meta entries; content_length is only a placeholder here.
        large = [{"source": s, "content_length": 0, "status": "SUCCESS"}
                 for s in large_source_names]
        # Report what was actually selected instead of the nominal 15/25/10 targets.
        print(f"Stratify: {len(small)} small + {len(medium)} medium from v2, "
              f"{len(large)} large from large_bucket_sources.json")
    else:
        large = []
        print("WARN: large_bucket_sources.json not found, no large docs in sample")

    return small + medium + large
|
||||
|
||||
|
||||
def fmt_metrics(m):
    """Render a graph-metrics dict as a compact one-line summary.

    Args:
        m: metrics mapping with ``n_entities``, ``n_edges``,
            ``predicate_diversity``, ``type_diversity``, ``avg_degree`` and
            ``largest_component`` keys, or ``None`` when no metrics exist.

    Returns:
        str: ``"n/a"`` for ``None``, otherwise the formatted summary.
    """
    if m is None:
        return "n/a"

    entity_count = m['n_entities']
    size_part = f"e={entity_count} edge={m['n_edges']} "
    diversity_part = f"pred={m['predicate_diversity']} type={m['type_diversity']} "
    shape_part = f"deg={m['avg_degree']} comp={m['largest_component']}/{entity_count}"
    return size_part + diversity_part + shape_part
|
||||
|
||||
|
||||
def _condition_a_record(a, a_metrics):
    """Build the stored Condition A entry from a call result and its metrics.

    ``a`` is either the dict returned by ``call_haiku`` or ``{"error": ...}``
    when the call failed, so every field is read with ``.get``.
    """
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def _fmt_delta_pct(value):
    """Format a signed percentage delta, or ``n/a`` when the delta is undefined."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the base-class A/B experiment and write a JSON summary.

    For every sampled document, Condition A sends the document text to Haiku
    with ``CONDITION_A_PROMPT``. Condition B first runs the local metadata pass
    and, when that yields parseable metadata, sends metadata + text to Haiku
    with ``CONDITION_B_API_PROMPT``. Per-document records, per-size-bucket
    averages and API cost totals are written to ``OUTPUT_FILE`` and a summary
    is printed.

    Exits with status 1 when ``ANTHROPIC_API_KEY`` or ``PG_DSN`` is unset, or
    when ``V2_FILE`` does not exist.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE) as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print(f"Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    # try/finally so the database connection is released even if a document
    # raises an unexpected error part-way through the run.
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            sent_len = len(doc_text)
            truncated = original_len > sent_len
            # Bucket by the text actually sent, not by the v2 metadata length.
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
                a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
                print(f" {fmt_metrics(a_metrics)}", flush=True)
            except Exception as e:
                # Best-effort: record the failure and keep processing the sample.
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_metrics = None

            # Fields shared by every outcome for this document; condition_b is
            # attached below once its outcome is known.
            record = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": _condition_a_record(a, a_metrics),
            }

            # Condition B local metadata pass
            local_result = call_local_metadata(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']}", flush=True)
                record["condition_b"] = {
                    "skipped": "local_model_failed",
                    "local_error": local_result["error"],
                    "local_latency_s": local_result.get("latency_s"),
                }
                results.append(record)
                continue

            local_raw = local_result["response"]
            metadata = parse_metadata(local_raw)
            # Override LLM-hallucinated char_length with Python-computed truth
            if isinstance(metadata, dict):
                metadata["char_length"] = len(doc_text)
            print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
                  flush=True)

            if metadata is None:
                print(f" B: metadata parse failed — skipping API call", flush=True)
                record["condition_b"] = {
                    "skipped": "metadata_parse_failed",
                    "local_latency_s": local_result.get("latency_s"),
                    "local_raw": local_raw[:1000],
                }
                results.append(record)
                continue

            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
            b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text

            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
                b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
                print(f" {fmt_metrics(b_metrics)}", flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_metrics = None

            # Per-doc deltas (only when both API calls returned token counts)
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                pred_pct_str = "n/a"
                if a_metrics and b_metrics:
                    if a_metrics["n_edges"] > 0:
                        edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                    if a_metrics["predicate_diversity"] > 0:
                        pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
                print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                      flush=True)

            record["condition_b"] = {
                "local_latency_s": local_result.get("latency_s"),
                "local_metadata": metadata,
                "local_raw": local_raw[:1000],
                "api_input_tokens": b.get("input_tokens"),
                "api_output_tokens": b.get("output_tokens"),
                "api_latency_s": b.get("latency_s"),
                "metrics": b_metrics,
                "stop_reason": b.get("stop_reason"),
                "response_text": b.get("response_text", "")[:32000],
                "error": b.get("error"),
            }
            results.append(record)
    finally:
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # A "valid pair" has parsed graph metrics under both conditions; only
    # those rows contribute to token totals and bucket averages.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # Deltas are None when the baseline token sum is zero; formatting
            # None with a float spec would raise TypeError.
            print(f" cost: in {_fmt_delta_pct(stats['input_delta_pct'])} out {_fmt_delta_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BirdAI Briefing Generator v2 — Experiment 002b
|
||||
===============================================
|
||||
Changes from v1 (based on Experiment 004 human evaluation):
|
||||
- document_type now pre-classified by rule, not by model
|
||||
- Capture template header stripped before model sees content
|
||||
- noise_signals constrained to controlled vocabulary
|
||||
- Model prompt simplified — focuses only on reliable signal fields
|
||||
- Expanded document type vocabulary for BirdAI-specific types
|
||||
Results written to ~/aaronai/briefing_test_v2_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import hashlib
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables (including PG_DSN) from the project .env file.
load_dotenv(os.path.expanduser("~/aaronai/.env"))

# Database connection string; None when the variable is not set.
PG_DSN = os.getenv("PG_DSN")
# JSON file that main() writes the per-document results and summary to.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_v2_results.json")
# Model name sent in the request payload by run_briefing().
MODEL = "mistral"
# Row LIMIT used by get_sample_documents().
SAMPLE_SIZE = 50
# Endpoint that run_briefing() POSTs the prompt to.
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
|
||||
# Controlled vocabularies used by sanitize_briefing() to validate model output.

# Accepted document types; any other model-supplied value is stored as "unknown".
VALID_DOC_TYPES = {
    "voice_capture", "image_capture",
    "dream_nrem", "dream_rem", "dream_lucid", "dream_synthesis",
    "presentation", "code", "spreadsheet",
    "academic_pdf", "technical_doc", "chat_log",
    "book_excerpt", "form", "syllabus", "email",
    "notes", "purchase_order", "annual_report",
    "invoice", "memo", "report", "unknown"
}

# Accepted density values; anything else falls back to "medium".
VALID_DENSITIES = {"high", "medium", "low"}
# Accepted extraction priorities; anything else falls back to "partial".
VALID_PRIORITIES = {"full", "partial", "skip"}

# Accepted noise_signals entries; unknown entries are dropped.
VALID_NOISE_SIGNALS = {
    "repeated_headers", "page_numbers", "formatting_artifacts",
    "boilerplate", "watermarks", "footers", "line_numbers",
    "encoding_artifacts", "ocr_errors"
}

# Accepted structure_signals entries; unknown entries are dropped.
VALID_STRUCTURE_SIGNALS = {
    "headings", "bullet_lists", "numbered_lists", "tables",
    "code_blocks", "citations", "footnotes", "images",
    "forms", "columns", "sections"
}
|
||||
|
||||
|
||||
def pre_classify_document(source, content):
    """Classify a document by rule and strip its capture/dream template header.

    Filename rules are tried first, in order; if none match, the opening of
    ``content`` is inspected for ``# Dream`` / ``# Capture`` headings.

    Args:
        source: path or name of the document; only the lower-cased basename
            is used for the filename rules.
        content: full document text.

    Returns:
        tuple: ``(doc_type, cleaned_content)``. ``doc_type`` is ``None`` when
        no rule matched. ``cleaned_content`` is the text after the first
        ``---`` when the part before it looks like a template header and the
        remainder is non-empty; otherwise it is ``content`` unchanged.
    """
    name = os.path.basename(source).lower()

    # --- Template header stripping -------------------------------------
    text = content
    head, separator, tail = content.partition("---")
    if separator:
        lowered_head = head.lower()
        header_markers = ("**type:**", "**modality:**", "# capture", "# dream")
        stripped_tail = tail.strip()
        if stripped_tail and any(tag in lowered_head for tag in header_markers):
            text = stripped_tail

    # --- Filename rules (first match wins) -----------------------------
    filename_rules = (
        ("dream_nrem", "nrem" in name),
        ("dream_lucid", "lucid" in name),
        ("dream_rem", "-rem-" in name or name.endswith("-rem.md")),
        ("dream_synthesis", "synthesis" in name and name.endswith(".md")),
        ("voice_capture", "-voice" in name or "voice-" in name),
        ("image_capture", "-image" in name or "image-" in name),
        ("presentation", name.endswith((".pptx", ".ppt"))),
        ("spreadsheet", name.endswith((".xlsx", ".xls", ".csv"))),
        ("code", name.endswith((".py", ".js", ".ts", ".cpp", ".c", ".h", ".java", ".rs"))),
        ("code", name.endswith("cmakelists.txt") or name == "makefile"),
    )
    kind = next((label for label, matched in filename_rules if matched), None)
    if kind is not None:
        return kind, text

    # --- Content heading rules -----------------------------------------
    if content.startswith("# Dream"):
        opening = content[:50].lower()
        if "nrem" in opening:
            kind = "dream_nrem"
        elif "lucid" in opening:
            kind = "dream_lucid"
        elif "rem" in opening:
            kind = "dream_rem"
        else:
            kind = "dream_synthesis"
    elif content.startswith("# Capture"):
        if "voice" in content[:100].lower():
            kind = "voice_capture"
        else:
            kind = "image_capture"

    return kind, text
|
||||
|
||||
|
||||
def build_briefing_prompt(content, pre_classified_type=None):
    """Assemble the JSON-briefing prompt for one document.

    Args:
        content: document text; only the first 1500 characters are embedded.
        pre_classified_type: rule-derived document type. When truthy it is
            written into the template as a fixed value; otherwise the template
            lists the types the model may choose from.

    Returns:
        str: the complete prompt, ending with the document excerpt.
    """
    type_instruction = (
        f'\n "document_type": "{pre_classified_type}", // pre-classified, do not change'
        if pre_classified_type
        else '\n "document_type": "one of: academic_pdf, technical_doc, chat_log, book_excerpt, form, syllabus, email, notes, purchase_order, annual_report, invoice, memo, report, unknown",'
    )
    excerpt = content[:1500]

    return f"""Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.

Return exactly this structure:
{{{type_instruction}
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}}

Rules:
- density: high=information dense technical or academic, medium=mixed, low=narrative/literary/sparse/short
- has_proper_nouns: true if you see capitalized words that are NOT sentence starts or template headers
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: use ONLY these terms: headings, bullet_lists, numbered_lists, tables, code_blocks, citations, footnotes, images, forms, columns, sections
- noise_signals: use ONLY these terms: repeated_headers, page_numbers, formatting_artifacts, boilerplate, watermarks, footers, line_numbers, encoding_artifacts, ocr_errors
- extraction_priority: full if density=high and likely_has_named_entities=true; skip if density=low AND likely_has_named_entities=false AND content is under 200 words; partial otherwise

Document:
{excerpt}"""
|
||||
|
||||
|
||||
def get_sample_documents():
    """Fetch a random sample of documents, one row per distinct source.

    Only rows whose ``document`` is longer than 100 and shorter than 3000
    characters are considered. For each source one row is chosen at random;
    up to ``SAMPLE_SIZE`` of those per-source rows are then chosen at random.

    Returns:
        list: rows with ``id``, ``document``, ``source`` and ``created_at``,
        fetched through ``RealDictCursor``.

    Raises:
        RuntimeError: if ``PG_DSN`` is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            # DISTINCT ON requires ORDER BY to start with `source`, so applying
            # LIMIT directly to that query always returned the alphabetically
            # first sources. The outer query shuffles the per-source rows
            # before limiting so the sample is random across sources.
            cur.execute("""
                SELECT id, document, source, created_at
                FROM (
                    SELECT DISTINCT ON (source) id, document, source, created_at
                    FROM embeddings
                    WHERE length(document) > 100
                    AND length(document) < 3000
                    ORDER BY source, random()
                ) AS per_source
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            docs = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return docs
|
||||
|
||||
|
||||
def run_briefing(prompt):
    """Send ``prompt`` to the local model and parse a JSON object from the reply.

    The reply text is searched for its first ``{`` and last ``}``; the span
    between them is decoded as JSON and must be a dict.

    Args:
        prompt: full prompt text for the model.

    Returns:
        tuple: ``(briefing_dict, raw_text)`` on success, or
        ``(None, error_string)`` on any failure. Error strings are prefixed
        with ``NO_JSON``, ``NOT_DICT``, ``URL_ERROR``, ``TIMEOUT``,
        ``JSON_ERROR`` or ``ERROR``.
    """
    request_body = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            envelope = json.loads(resp.read().decode())
        raw = envelope.get("response", "").strip()

        # Locate the outermost braces; the model may wrap the JSON in prose.
        first_brace = raw.find("{")
        last_brace = raw.rfind("}")
        if first_brace < 0 or last_brace < 0:
            return None, f"NO_JSON: {raw[:200]}"

        candidate = json.loads(raw[first_brace:last_brace + 1])
        if isinstance(candidate, dict):
            return candidate, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
|
||||
|
||||
|
||||
def sanitize_briefing(briefing, pre_classified_type=None):
    """Coerce a raw model briefing into the validated briefing schema.

    Args:
        briefing: dict parsed from the model reply.
        pre_classified_type: rule-derived document type; when truthy it
            overrides whatever the model returned.

    Returns:
        dict with ``document_type``, ``primary_language``, ``density``, the six
        boolean flags, ``structure_signals``, ``noise_signals`` and
        ``extraction_priority``. Values outside the controlled vocabularies
        fall back to ``"unknown"`` / ``"medium"`` / ``"partial"``; unknown
        signal terms are dropped.
    """
    safe = {}

    if pre_classified_type:
        safe["document_type"] = pre_classified_type
    else:
        dt = str(briefing.get("document_type", "unknown")).lower().strip()
        safe["document_type"] = dt if dt in VALID_DOC_TYPES else "unknown"

    safe["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]

    density = str(briefing.get("density", "medium")).lower().strip()
    safe["density"] = density if density in VALID_DENSITIES else "medium"

    for field in ("has_proper_nouns", "has_dates", "has_numeric_data",
                  "has_institutional_language", "has_technical_terms",
                  "likely_has_named_entities"):
        val = briefing.get(field, False)
        if isinstance(val, str):
            # Strip like every other string field so " true" / "True\n" count.
            safe[field] = val.strip().lower() in ("true", "yes", "1")
        else:
            safe[field] = bool(val)

    for field, valid_set in (("structure_signals", VALID_STRUCTURE_SIGNALS),
                             ("noise_signals", VALID_NOISE_SIGNALS)):
        val = briefing.get(field, [])
        if isinstance(val, str):
            val = [val]  # a bare string is treated as a one-item list
        elif not isinstance(val, list):
            val = []
        normalized = (str(v).lower().strip() for v in val)
        # dict.fromkeys drops repeated terms while keeping first-seen order, so
        # a model that repeats a signal does not inflate the signal count.
        safe[field] = list(dict.fromkeys(s for s in normalized if s in valid_set))

    priority = str(briefing.get("extraction_priority", "partial")).lower().strip()
    safe["extraction_priority"] = priority if priority in VALID_PRIORITIES else "partial"
    return safe
|
||||
|
||||
|
||||
def estimate_token_reduction(original_text, briefing):
    """Estimate the token savings a briefing would give for one document.

    Tokens are approximated as ``len(original_text) / 4`` (minimum 1). A fixed
    200 tokens of orientation overhead is counted as saved, plus 5% of the
    document per noise signal, capped at 40%.

    Args:
        original_text: the document text the estimate is based on.
        briefing: sanitized briefing; ``extraction_priority`` and
            ``noise_signals`` are read.

    Returns:
        dict of estimate fields. For ``extraction_priority == "skip"`` the
        reduction is reported as 100% with a ``note``; otherwise the total
        reduction is capped at 99.0%.
    """
    approx_tokens = max(len(original_text) / 4, 1)
    orientation_saved = 200

    if briefing.get("extraction_priority") == "skip":
        # Skipped documents never reach the API, so everything is saved.
        skipped = {}
        skipped["original_tokens_approx"] = round(approx_tokens)
        skipped["orientation_tokens_saved"] = round(approx_tokens + 200)
        skipped["noise_reduction_pct"] = 100.0
        skipped["total_reduction_pct"] = 100.0
        skipped["note"] = "skip — no API call"
        return skipped

    signals = briefing.get("noise_signals", [])
    noise_fraction = min(len(signals) * 0.05, 0.40)
    noise_saved = approx_tokens * noise_fraction
    saved = orientation_saved + noise_saved
    overall_pct = min((saved / (approx_tokens + 200)) * 100, 99.0)

    estimate = {}
    estimate["original_tokens_approx"] = round(approx_tokens)
    estimate["orientation_tokens_saved"] = orientation_saved
    estimate["noise_tokens_saved"] = round(noise_saved)
    estimate["noise_reduction_pct"] = round(noise_fraction * 100, 1)
    estimate["total_reduction_pct"] = round(overall_pct, 1)
    return estimate
|
||||
|
||||
|
||||
def format_eta(elapsed_times, completed, total):
    """Return an ``ETA: H:MM:SS`` label based on the average time per item.

    Args:
        elapsed_times: durations (seconds) of the items processed so far.
        completed: number of items processed so far.
        total: total number of items.

    Returns:
        str: ``"ETA: --:--"`` before any item has completed, otherwise the
        remaining time rendered via ``timedelta``.
    """
    if completed == 0:
        return "ETA: --:--"

    mean_seconds = sum(elapsed_times) / completed
    remaining_seconds = int((total - completed) * mean_seconds)
    return "ETA: " + str(timedelta(seconds=remaining_seconds))
|
||||
|
||||
|
||||
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of ``text``.

    Used as a short content fingerprint in the results file, not for security.
    """
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()[:8]
|
||||
|
||||
|
||||
def main():
    """Run Experiment 002b: brief every sampled document with the local model.

    Each document is pre-classified by rule, briefed by the model, sanitized,
    and scored with a token-reduction estimate. Per-document records are
    checkpointed to ``RESULTS_FILE`` after every document, and the file is
    rewritten with timing metadata and a summary at the end. A results table
    is printed to stdout.
    """
    test_start = time.time()
    print(f"\nBirdAI Briefing Generator v2 — Experiment 002b")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Changes: rule-based doc_type, template stripping, controlled vocab")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")

    results = {
        "meta": {"model": MODEL, "version": "v2", "sample_size": len(docs),
                 "started": datetime.now().isoformat(), "completed": None,
                 "total_elapsed_seconds": None, "avg_seconds_per_doc": None},
        "documents": [], "summary": {}
    }

    success_count = 0
    failed_count = 0
    pre_classified_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # A NULL source column comes back as None even though the key exists;
        # fall back so slicing and basename() below cannot fail.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)

        pre_type, cleaned_content = pre_classify_document(source, content)
        was_pre_classified = pre_type is not None
        if was_pre_classified:
            pre_classified_count += 1

        eta_str = format_eta(elapsed_times, i, len(docs))
        # R = type decided by rule, M = type left to the model.
        pre_flag = "R" if was_pre_classified else "M"
        print(f"[{i+1:02d}/{len(docs)}][{pre_flag}] {source[:36]:<36} {eta_str:<14}", end=" ", flush=True)

        prompt = build_briefing_prompt(cleaned_content, pre_type)
        t_start = time.time()
        briefing, raw = run_briefing(prompt)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            # On failure run_briefing() returns the error text in `raw`.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "pre_classified_type": pre_type, "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing, pre_type)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(cleaned_content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "cleaned_content_length": len(cleaned_content),
                "status": "SUCCESS", "pre_classified_type": pre_type,
                "was_pre_classified": was_pre_classified, "elapsed_seconds": elapsed,
                "briefing": briefing, "token_reduction_estimate": reduction
            })

        # Checkpoint after every document so a crash keeps partial results.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the ratio: an empty sample must not raise ZeroDivisionError.
    success_ratio = success_count / total if total else 0.0
    summary = {
        "total": total, "success": success_count, "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "pre_classified_by_rule": pre_classified_count,
        "classified_by_model": total - pre_classified_count,
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed, "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8
    }
    results["summary"] = summary
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 75)
    print(f"RESULTS — Briefing Generator v2")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Pre-classified (rule): {pre_classified_count}")
    print(f" Classified (model): {total - pre_classified_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)
|
||||
|
||||
|
||||
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,508 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cascade Optimization Test — skip-small + compressed-draft
|
||||
|
||||
Tests whether two optimizations on the entity-drafter cascade meaningfully
|
||||
improve the savings ceiling beyond the prior unoptimized cascade (12.66%).
|
||||
|
||||
Optimizations:
|
||||
A — Skip-small-docs routing: docs <1000 chars bypass the local pass entirely
|
||||
B — Compressed draft format: bare JSON array instead of markdown bullets
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction (unchanged from prior)
|
||||
B — Optimized cascade: skip-small + compressed draft, otherwise same cascade
|
||||
|
||||
Sample: 30 docs from briefing_test_v2_results.json:
|
||||
- 10 small (<1000 chars) — should show 0% delta if skip-small works
|
||||
- 12 medium (1000-5000 chars) — primary test bucket
|
||||
- 8 large (5000-12000 chars, capped at 12K)
|
||||
|
||||
Mistral context: 12K (raised from 8K in prior run).
|
||||
|
||||
Outputs: ~/aaronai/experiments/cascade_optimization_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_optimization_results.json"
|
||||
HAIKU_MODEL = "claude-haiku-4-5-20251001"
|
||||
HAIKU_MAX_TOKENS = 4096
|
||||
HAIKU_TEMPERATURE = 0.0
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
LOCAL_MODEL = "mistral"
|
||||
LOCAL_TIMEOUT = 180 # raised — 12K context can take longer
|
||||
MAX_DOC_CHARS = 12000 # raised from 8K
|
||||
SKIP_SMALL_THRESHOLD = 1000
|
||||
|
||||
HAIKU_IN_PER_M = 1.0
|
||||
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
|
||||
|
||||
Return ONLY valid JSON:
|
||||
{
|
||||
"candidates": [string]
|
||||
}
|
||||
|
||||
Just names. No types, no relationships. JSON only.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
# Compressed draft format — bare JSON array, minimal preamble
|
||||
CONDITION_B_API_PROMPT_COMPRESSED = """Extract a knowledge graph from the document below.
|
||||
|
||||
Local model entity candidates (hint, not authoritative — verify against the document, ignore false ones, add missed ones):
|
||||
{local_draft_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* with a surrounding Markdown code fence removed.

    Strips an opening fence (three backticks, optionally followed by a
    ``json`` tag in any letter case) at the very start and a closing fence at
    the very end, then trims whitespace. Falsy input (``None``, ``""``)
    yields ``""``.
    """
    if not text:
        return ""
    t = text.strip()
    # IGNORECASE: a model may emit ```JSON; without it the tag would be left
    # in the payload and json.loads would fail downstream.
    t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE)
    t = re.sub(r"\s*```$", "", t)
    return t.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Load the stored text for *source* and cap it at MAX_DOC_CHARS.

    Selects every ``document`` value from the ``embeddings`` table whose
    ``source`` matches, ordered by ``id``, and joins them with blank lines.

    Returns:
        ``(text, full_length)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``full_length`` is the length before the
        cut; ``(None, 0)`` when there are no rows or every row is NULL.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query itself fails.
        cur.close()
    # A NULL document would make str.join raise TypeError; skip those rows.
    chunks = [row[0] for row in rows if row[0] is not None]
    if not chunks:
        return None, 0
    full = "\n\n".join(chunks)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarise the reply.

    Uses HAIKU_MODEL, HAIKU_MAX_TOKENS and HAIKU_TEMPERATURE. Returns a dict
    with the reported input/output token counts, wall-clock latency in
    seconds, the text of the first content block ("" when the reply has no
    content), and the stop reason. Exceptions from the client propagate.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local(document_text):
    """Ask the local model for entity candidates for *document_text*.

    POSTs LOCAL_PROMPT plus the document to OLLAMA_URL (non-streaming, JSON
    format) with a LOCAL_TIMEOUT-second timeout. Never raises: on success
    returns ``{"response", "latency_s"}``; on any failure returns
    ``{"error", "latency_s"}`` so the caller can record the skip.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as exc:
        # Deliberately broad: any local-model failure is recorded, not fatal.
        return {"error": str(exc), "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph(raw):
    """Count entities and edges in a model's JSON graph reply.

    Returns ``(entity_count, edge_count)`` when *raw* (after fence stripping)
    parses to a JSON object whose ``entities`` and ``edges`` values are both
    lists; otherwise ``(None, None)``.
    """
    payload = strip_json_fences(raw)
    if payload:
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            entities = parsed.get("entities")
            edges = parsed.get("edges")
            if isinstance(entities, list) and isinstance(edges, list):
                return len(entities), len(edges)
    return None, None
|
||||
|
||||
|
||||
def parse_candidates(raw):
    """Extract the candidate-name list from the local model's JSON reply.

    Returns a list of stripped, non-empty, de-duplicated names (first
    occurrence order preserved) when *raw* parses to a JSON object whose
    ``candidates`` value is a list; otherwise ``None``.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    names = []
    seen = set()
    for cand in cands:
        if not cand:
            continue
        name = str(cand).strip()
        # Whitespace-only entries strip to "" and repeats add nothing; both
        # would only pad the draft that is later sent to the API.
        if name and name not in seen:
            seen.add(name)
            names.append(name)
    return names
|
||||
|
||||
|
||||
def stratify(docs, *, n_small=10, n_medium=12, n_large=8):
    """Pick 10 small / 12 medium / 8 large by character length, in file order.

    Buckets are decided by each doc's ``content_length``: small is < 1000,
    medium is 1000-4999, large is >= 5000. Docs without a numeric
    ``content_length`` cannot be bucketed and are left out.

    Args:
        docs: iterable of document metadata dicts.
        n_small, n_medium, n_large: maximum docs taken from each bucket.

    Returns:
        A list with the small picks first, then medium, then large.
    """
    small, medium, large = [], [], []
    for doc in docs:
        length = doc.get("content_length")
        if not isinstance(length, (int, float)):
            continue
        if length < 1000:
            small.append(doc)
        elif length < 5000:
            medium.append(doc)
        else:
            large.append(doc)
    return small[:n_small] + medium[:n_medium] + large[:n_large]
|
||||
|
||||
|
||||
def _size_bucket(n_chars):
    """Label a character count as "small" (<1000), "medium" (<5000) or "large"."""
    if n_chars < 1000:
        return "small"
    if n_chars < 5000:
        return "medium"
    return "large"


def _condition_a_record(a, a_ents, a_edges):
    """Build the ``condition_a`` entry stored for one document."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _run_condition(client, prompt_text, label):
    """Call Haiku once; return ``(call, entity_count, edge_count)`` without raising."""
    try:
        call = call_haiku(client, prompt_text)
        ents, edges = parse_graph(call["response_text"])
        print(f" {label}: in={call['input_tokens']} out={call['output_tokens']} "
              f"ents={ents} edges={edges} stop={call['stop_reason']} t={call['latency_s']}s",
              flush=True)
        return call, ents, edges
    except Exception as e:
        # Best-effort: a failed call is recorded on the document, not fatal.
        print(f" {label} FAILED: {e}", flush=True)
        return {"error": str(e)}, None, None


def _print_pair_delta(a, b, a_edges, b_edges):
    """Print token and edge deltas for one document when both calls have usage."""
    if "input_tokens" not in a or "input_tokens" not in b:
        return
    in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
    out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
    edge_pct_str = "n/a"
    if a_edges and b_edges is not None:
        edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
    print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)


def _run_document(client, pg_conn, index, total, source):
    """Run condition A and condition B for one source; return its result record."""
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{index:02d}/{total}] {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = _size_bucket(sent_len)
    skip_small_routed = sent_len < SKIP_SMALL_THRESHOLD
    trunc_marker = "*" if truncated else " "
    route_marker = "[skip-small]" if skip_small_routed else "[cascade] "
    print(f"[{index:02d}/{total}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] "
          f"{route_marker} {source[:50]}", flush=True)

    # Condition A — always runs
    a, a_ents, a_edges = _run_condition(client, CONDITION_A_PROMPT + doc_text, "A")

    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "skip_small_routed": skip_small_routed,
        "condition_a": _condition_a_record(a, a_ents, a_edges),
    }

    # Condition B
    if skip_small_routed:
        # Skip-small: B = A. Same call, no local pass.
        print(" B: routed to baseline (skip-small)", flush=True)
        b, b_ents, b_edges = a, a_ents, a_edges
        local_result = {"skipped": "skip_small_routed"}
        local_candidates = []
        local_raw = ""
    else:
        local_result = call_local(doc_text)
        if "error" in local_result:
            print(f" B local FAILED: {local_result['error']} — recording skip", flush=True)
            record["condition_b"] = {
                "skipped": "local_model_failed",
                "local_error": local_result["error"],
                "local_latency_s": local_result.get("latency_s"),
            }
            return record

        local_raw = local_result["response"]
        local_candidates = parse_candidates(local_raw) or []
        print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
              flush=True)

        if not local_candidates:
            print(" B local: empty draft — skipping API call", flush=True)
            record["condition_b"] = {
                "skipped": "local_draft_empty",
                "local_latency_s": local_result.get("latency_s"),
                "local_raw": local_raw[:1000],
            }
            return record

        # Compressed draft format — bare JSON array
        local_draft_json = json.dumps(local_candidates, ensure_ascii=False)
        b_prompt = CONDITION_B_API_PROMPT_COMPRESSED.replace("{local_draft_json}", local_draft_json) + doc_text
        b, b_ents, b_edges = _run_condition(client, b_prompt, "B api")

    _print_pair_delta(a, b, a_edges, b_edges)

    record["condition_b"] = {
        "skip_small_routed": skip_small_routed,
        "local_latency_s": local_result.get("latency_s"),
        "local_candidates": local_candidates,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "entity_count": b_ents,
        "edge_count": b_edges,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:4000],
        "error": b.get("error"),
    }
    return record


def _bucket_stats(rows):
    """Aggregate token totals and average edge counts for one size bucket."""
    if not rows:
        return None
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
    be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
    skip_count = sum(1 for r in rows if r.get("skip_small_routed"))
    return {
        "n": len(rows),
        "n_skip_small_routed": skip_count,
        "n_cascade": len(rows) - skip_count,
        "a_input_tokens": ai,
        "a_output_tokens": ao,
        "b_input_tokens": bi,
        "b_output_tokens": bo,
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
        "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
        "b_avg_edges": round(statistics.mean(be), 1) if be else None,
    }


def _fmt_pct(value):
    """Format a percentage delta for the console; ``None`` prints as "n/a"."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run baseline (A) vs optimized cascade (B) over the sample and write OUTPUT_FILE.

    Exits with status 1 when ANTHROPIC_API_KEY or PG_DSN is unset, or when
    V2_FILE does not exist. Per-document failures are recorded in the results
    rather than aborting the run.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (10s/12m/8l, file order)")
    print(f"Skip-small threshold: <{SKIP_SMALL_THRESHOLD} chars")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            results.append(_run_document(client, pg_conn, i, len(sample), doc_meta["source"]))
    finally:
        # Close the connection even if a document raises or the run is interrupted.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions reported token usage are comparable.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {
        bucket: _bucket_stats([r for r in valid if r["size_bucket"] == bucket])
        for bucket in ("small", "medium", "large")
    }

    # Cost delta of the earlier, unoptimized cascade run (see module docstring).
    prior_pct = -12.66

    summary = {
        "experiment": "cascade_optimization_test",
        "title": "Cascade Optimization — skip-small + compressed-draft",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "skip_small_threshold": SKIP_SMALL_THRESHOLD,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "prior_unoptimized_cascade_pct": prior_pct,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary["totals"]["cost_delta_pct"]
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
        opt_delta = delta_pct - prior_pct
        print(f"Optimization delta vs prior cascade: {opt_delta:+.2f} points "
              f"(prior was {prior_pct}%)")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # Deltas are None when a bucket's baseline token total is zero.
            print(f" {bucket:6s} (n={stats['n']}, skip={stats['n_skip_small_routed']}): "
                  f"in {_fmt_pct(stats['input_delta_pct'])} "
                  f"out {_fmt_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print("Results: " + str(OUTPUT_FILE))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,485 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cascade Test — Nodes-vs-Edges Experiment
|
||||
|
||||
Tests whether splitting graph extraction into "local drafts entity candidates,
|
||||
API verifies + draws edges" reduces total API cost vs single-shot full
|
||||
extraction, while producing a comparable graph.
|
||||
|
||||
Two conditions per document:
|
||||
A — Baseline: single Claude Haiku call, full extraction
|
||||
B — Cascade: Mistral lists entity candidates, then Haiku does verify+edges
|
||||
|
||||
Both conditions:
|
||||
- See the full document (parity-respecting)
|
||||
- Use open entity type vocabulary (no fixed schema)
|
||||
- Use natural-language predicates (no constrained relations)
|
||||
- Same target output schema, same temperature
|
||||
|
||||
Sample: 20 docs from briefing_test_v2_results.json, stratified by char length.
|
||||
Reports API cost only. Local Mistral time is recorded but not monetized
|
||||
(ran on the VPS, no per-token API charge).
|
||||
|
||||
Outputs: ~/aaronai/experiments/cascade_test_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_test_results.json"
|
||||
HAIKU_MODEL = "claude-haiku-4-5-20251001"
|
||||
HAIKU_MAX_TOKENS = 4096
|
||||
HAIKU_TEMPERATURE = 0.0
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
LOCAL_MODEL = "mistral"
|
||||
LOCAL_TIMEOUT = 120
|
||||
MAX_DOC_CHARS = 8000
|
||||
|
||||
# Verified pricing 2026-04-28 against Anthropic docs
|
||||
HAIKU_IN_PER_M = 1.0
|
||||
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
|
||||
|
||||
Return ONLY valid JSON:
|
||||
{
|
||||
"candidates": [string]
|
||||
}
|
||||
|
||||
Just names. No types, no relationships. JSON only.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT_WITH_DRAFT = """Extract a knowledge graph from the document below.
|
||||
|
||||
A local model has identified entity candidates that may help orient your reading. Treat the candidates as a hint, not as truth — verify each candidate appears in the document, ignore any that do not, and add any entities the candidates missed.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
|
||||
|
||||
ENTITY CANDIDATES FROM LOCAL MODEL:
|
||||
{local_draft}
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* with a surrounding Markdown code fence removed.

    Strips an opening fence (three backticks, optionally followed by a
    ``json`` tag in any letter case) at the very start and a closing fence at
    the very end, then trims whitespace. Falsy input (``None``, ``""``)
    yields ``""``.
    """
    if not text:
        return ""
    t = text.strip()
    # IGNORECASE: a model may emit ```JSON; without it the tag would be left
    # in the payload and json.loads would fail downstream.
    t = re.sub(r"^```(?:json)?\s*", "", t, flags=re.IGNORECASE)
    t = re.sub(r"\s*```$", "", t)
    return t.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Load the stored text for *source* and cap it at MAX_DOC_CHARS.

    Selects every ``document`` value from the ``embeddings`` table whose
    ``source`` matches, ordered by ``id``, and joins them with blank lines.

    Returns:
        ``(text, full_length)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``full_length`` is the length before the
        cut; ``(None, 0)`` when there are no rows or every row is NULL.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query itself fails.
        cur.close()
    # A NULL document would make str.join raise TypeError; skip those rows.
    chunks = [row[0] for row in rows if row[0] is not None]
    if not chunks:
        return None, 0
    full = "\n\n".join(chunks)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarise the reply.

    Uses HAIKU_MODEL, HAIKU_MAX_TOKENS and HAIKU_TEMPERATURE. Returns a dict
    with the reported input/output token counts, wall-clock latency in
    seconds, the text of the first content block ("" when the reply has no
    content), and the stop reason. Exceptions from the client propagate.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local(document_text):
    """Ask the local model for entity candidates for *document_text*.

    POSTs LOCAL_PROMPT plus the document to OLLAMA_URL (non-streaming, JSON
    format) with a LOCAL_TIMEOUT-second timeout. Never raises: on success
    returns ``{"response", "latency_s"}``; on any failure returns
    ``{"error", "latency_s"}`` so the caller can record the skip.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 8192},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as exc:
        # Deliberately broad: any local-model failure is recorded, not fatal.
        return {"error": str(exc), "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph(raw):
    """Count entities and edges in a model's JSON graph reply.

    Returns ``(entity_count, edge_count)`` when *raw* (after fence stripping)
    parses to a JSON object whose ``entities`` and ``edges`` values are both
    lists; otherwise ``(None, None)``.
    """
    payload = strip_json_fences(raw)
    if payload:
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            entities = parsed.get("entities")
            edges = parsed.get("edges")
            if isinstance(entities, list) and isinstance(edges, list):
                return len(entities), len(edges)
    return None, None
|
||||
|
||||
|
||||
def parse_candidates(raw):
    """Extract the candidate-name list from the local model's JSON reply.

    Returns a list of stripped, non-empty, de-duplicated names (first
    occurrence order preserved) when *raw* parses to a JSON object whose
    ``candidates`` value is a list; otherwise ``None``.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    names = []
    seen = set()
    for cand in cands:
        if not cand:
            continue
        name = str(cand).strip()
        # Whitespace-only entries strip to "" and repeats add nothing; both
        # would only pad the draft that is later sent to the API.
        if name and name not in seen:
            seen.add(name)
            names.append(name)
    return names
|
||||
|
||||
|
||||
def stratify(docs, *, n_small=5, n_medium=10, n_large=5):
    """Pick 5 small / 10 medium / 5 large by character length, in file order.

    Buckets are decided by each doc's ``content_length``: small is < 1000,
    medium is 1000-4999, large is >= 5000. Docs without a numeric
    ``content_length`` cannot be bucketed and are left out.

    Args:
        docs: iterable of document metadata dicts.
        n_small, n_medium, n_large: maximum docs taken from each bucket.

    Returns:
        A list with the small picks first, then medium, then large.
    """
    small, medium, large = [], [], []
    for doc in docs:
        length = doc.get("content_length")
        if not isinstance(length, (int, float)):
            continue
        if length < 1000:
            small.append(doc)
        elif length < 5000:
            medium.append(doc)
        else:
            large.append(doc)
    return small[:n_small] + medium[:n_medium] + large[:n_large]
|
||||
|
||||
|
||||
def _size_bucket(n_chars):
    """Label a character count as "small" (<1000), "medium" (<5000) or "large"."""
    if n_chars < 1000:
        return "small"
    if n_chars < 5000:
        return "medium"
    return "large"


def _condition_a_record(a, a_ents, a_edges):
    """Build the ``condition_a`` entry stored for one document."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _run_condition(client, prompt_text, label):
    """Call Haiku once; return ``(call, entity_count, edge_count)`` without raising."""
    try:
        call = call_haiku(client, prompt_text)
        ents, edges = parse_graph(call["response_text"])
        print(f" {label}: in={call['input_tokens']} out={call['output_tokens']} "
              f"ents={ents} edges={edges} stop={call['stop_reason']} t={call['latency_s']}s",
              flush=True)
        return call, ents, edges
    except Exception as e:
        # Best-effort: a failed call is recorded on the document, not fatal.
        print(f" {label} FAILED: {e}", flush=True)
        return {"error": str(e)}, None, None


def _print_pair_delta(a, b, a_edges, b_edges):
    """Print token and edge deltas for one document when both calls have usage."""
    if "input_tokens" not in a or "input_tokens" not in b:
        return
    in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
    out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
    edge_pct_str = "n/a"
    if a_edges and b_edges is not None:
        edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
    print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)


def _run_document(client, pg_conn, index, total, source):
    """Run condition A and condition B for one source; return its result record."""
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{index:02d}/{total}] {source[:60]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = _size_bucket(sent_len)
    trunc_marker = "*" if truncated else " "
    print(f"[{index:02d}/{total}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A
    a, a_ents, a_edges = _run_condition(client, CONDITION_A_PROMPT + doc_text, "A")

    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "condition_a": _condition_a_record(a, a_ents, a_edges),
    }

    # Condition B local pass
    local_result = call_local(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']} — skipping doc", flush=True)
        record["condition_b"] = {
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        }
        return record

    local_raw = local_result["response"]
    local_candidates = parse_candidates(local_raw) or []
    print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
          flush=True)

    if not local_candidates:
        print(" B local: empty draft — skipping API call to avoid asymmetric test", flush=True)
        record["condition_b"] = {
            "skipped": "local_draft_empty",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        }
        return record

    # The draft is passed to the API as a Markdown bullet list.
    local_draft_str = "\n".join(f"- {c}" for c in local_candidates)
    b_prompt = CONDITION_B_API_PROMPT_WITH_DRAFT.replace("{local_draft}", local_draft_str) + doc_text
    b, b_ents, b_edges = _run_condition(client, b_prompt, "B api")

    _print_pair_delta(a, b, a_edges, b_edges)

    record["condition_b"] = {
        "local_latency_s": local_result.get("latency_s"),
        "local_candidates": local_candidates,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "entity_count": b_ents,
        "edge_count": b_edges,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:4000],
        "error": b.get("error"),
    }
    return record


def _bucket_stats(rows):
    """Aggregate token totals and average edge counts for one size bucket."""
    if not rows:
        return None
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
    be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
    return {
        "n": len(rows),
        "a_input_tokens": ai,
        "a_output_tokens": ao,
        "b_input_tokens": bi,
        "b_output_tokens": bo,
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
        "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
        "b_avg_edges": round(statistics.mean(be), 1) if be else None,
    }


def _fmt_pct(value):
    """Format a percentage delta for the console; ``None`` prints as "n/a"."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run baseline (A) vs cascade (B) over the sample and write OUTPUT_FILE.

    Exits with status 1 when ANTHROPIC_API_KEY or PG_DSN is unset, or when
    V2_FILE does not exist. Per-document failures are recorded in the results
    rather than aborting the run.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (stratified by char length, file order)")
    for d in sample:
        print(f" [{d['content_length']:>6}c] {d['source'][:60]}")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print(f"Local model: {LOCAL_MODEL}")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            results.append(_run_document(client, pg_conn, i, len(sample), doc_meta["source"]))
    finally:
        # Close the connection even if a document raises or the run is interrupted.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions reported token usage are comparable.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {
        bucket: _bucket_stats([r for r in valid if r["size_bucket"] == bucket])
        for bucket in ("small", "medium", "large")
    }

    summary = {
        "experiment": "cascade_test",
        "title": "Nodes-vs-Edges Cascade Experiment",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary["totals"]["cost_delta_pct"]
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # Deltas are None when a bucket's baseline token total is zero.
            print(f" {bucket:6s} (n={stats['n']}): "
                  f"in {_fmt_pct(stats['input_delta_pct'])} "
                  f"out {_fmt_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print("NOTE: API cost only. Local Mistral runtime is not monetized.")
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Experiment 003 — Entity-Only Consistency Test
|
||||
|
||||
Three Mistral passes per document, measure consistency on entity fields only
|
||||
(people, organizations, locations, dates). Excludes document_type label.
|
||||
DISTINCT ON (source) sampling — fixes Exp 001 chunk-replacement flaw.
|
||||
|
||||
Outputs: ~/aaronai/experiments/consistency_test_v2_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "consistency_test_v2_results.json"
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
MODEL = "mistral"
|
||||
N_PASSES = 3
|
||||
N_DOCS = 50
|
||||
PER_CALL_TIMEOUT = 60 # seconds — fail fast, don't wedge
|
||||
MAX_DOC_CHARS = 8000 # cap document length sent to Mistral
|
||||
|
||||
EXTRACTION_PROMPT = """Extract entities from the document below. Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"people": [string],
|
||||
"organizations": [string],
|
||||
"locations": [string],
|
||||
"dates": [string]
|
||||
}
|
||||
Rules:
|
||||
- Only include entities you are CERTAIN about. If uncertain, omit.
|
||||
- No prose, no markdown fences, no commentary. JSON only.
|
||||
- Empty arrays are valid.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def call_mistral(document_text):
    """Run one entity-extraction pass against the local Ollama endpoint.

    The document is cut to ``MAX_DOC_CHARS`` characters, appended to
    ``EXTRACTION_PROMPT`` and posted to ``OLLAMA_URL`` as a non-streaming
    request with JSON output format and a 512-token generation cap.

    Returns:
        On success, a dict with ``response`` (the raw model text),
        ``latency_s`` and ``truncated`` (whether the input exceeded the cap).
        On failure, a dict with ``error`` and ``latency_s``; a timeout reports
        the configured timeout value as its latency.
    """
    clipped = document_text[:MAX_DOC_CHARS]
    started = time.time()
    request_body = {
        "model": MODEL,
        "prompt": EXTRACTION_PROMPT + clipped,
        "stream": False,
        "format": "json",
        "options": {"num_predict": 512},
    }
    try:
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=PER_CALL_TIMEOUT)
        reply.raise_for_status()
        model_text = reply.json().get("response", "")
    except requests.exceptions.Timeout:
        # Fail fast: a wedged model must not stall the whole experiment.
        return {"error": f"timeout after {PER_CALL_TIMEOUT}s", "latency_s": PER_CALL_TIMEOUT}
    except Exception as exc:
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
    return {
        "response": model_text,
        "latency_s": round(time.time() - started, 2),
        "truncated": len(document_text) > MAX_DOC_CHARS,
    }
|
||||
|
||||
|
||||
def parse_entities(raw_response):
    """Parse a model reply into normalized entity lists.

    Leading/trailing markdown code fences are stripped before decoding. Each
    of the four entity fields is normalized to a sorted list of unique,
    stripped, lower-cased strings so that passes can be compared directly.

    Args:
        raw_response: Raw text returned by the model (may be None or empty).

    Returns:
        A dict with keys ``people``, ``organizations``, ``locations`` and
        ``dates``; or None when the reply is not valid JSON, is not a JSON
        object, or any present field is not a list.
    """
    text = (raw_response or "").strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not an object (e.g. "[]" or a bare string) has no
    # fields to read; treat it as a parse failure rather than crashing.
    if not isinstance(data, dict):
        return None

    out = {}
    for key in ("people", "organizations", "locations", "dates"):
        vals = data.get(key, [])
        if not isinstance(vals, list):
            return None
        out[key] = sorted({str(v).strip().lower() for v in vals if v})
    return out
|
||||
|
||||
|
||||
def entities_match(a, b):
    """Return True when two parsed entity dicts agree on every field.

    A missing parse (None) on either side never matches.
    """
    if a is None or b is None:
        return False
    for field in ("people", "organizations", "locations", "dates"):
        if not a[field] == b[field]:
            return False
    return True
|
||||
|
||||
|
||||
def fetch_distinct_sources(pg_conn, n):
    """Fetch up to ``n`` whole documents, one per distinct source.

    Chunks belonging to the same source are concatenated in id order,
    separated by a blank line. Sources are ordered by their smallest chunk id.

    Args:
        pg_conn: Open database connection exposing ``cursor()``.
        n: Maximum number of sources to request from the database.

    Returns:
        A list of ``(source, document)`` tuples. Documents that are empty or
        have 50 or fewer non-whitespace-trimmed characters are dropped after
        the query, so the list may be shorter than ``n``.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute("""
            SELECT source, string_agg(document, E'\n\n' ORDER BY id) AS doc
            FROM embeddings
            WHERE source IS NOT NULL
            GROUP BY source
            ORDER BY MIN(id)
            LIMIT %s
        """, (n,))
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    return [(s, d) for s, d in rows if d and len(d.strip()) > 50]
|
||||
|
||||
|
||||
def main():
    """Run Experiment 003 and write the consistency summary to ``OUTPUT_FILE``.

    For each sampled source, the document is sent to the model ``N_PASSES``
    times. A document is "fully consistent" when every pass parsed and all
    passes produced identical values for all four entity fields. Per-field
    agreement is tracked separately. Exits with status 1 when ``PG_DSN`` is
    not set.
    """
    fields = ("people", "organizations", "locations", "dates")
    baseline_pct = 18.0  # Exp 001 result this run is compared against

    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    pg_conn = psycopg2.connect(pg_dsn)
    try:
        docs = fetch_distinct_sources(pg_conn, N_DOCS)
    finally:
        # Close the connection even if the sampling query fails.
        pg_conn.close()

    print(f"Loaded {len(docs)} distinct sources from pgvector")
    print(f"Model: {MODEL} | Passes per doc: {N_PASSES}")
    print(f"Per-call timeout: {PER_CALL_TIMEOUT}s | Max doc chars: {MAX_DOC_CHARS}")
    print(f"Calls planned: {len(docs) * N_PASSES}\n")

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    for i, (source, doc_text) in enumerate(docs, 1):
        size_marker = f"[{len(doc_text):>5}c]"
        print(f"[{i:02d}/{len(docs)}] {size_marker} {source[:55]}", flush=True)

        passes = []
        for pass_no in range(1, N_PASSES + 1):
            r = call_mistral(doc_text)
            if "error" in r:
                print(f" pass {pass_no}: {r['error']}", flush=True)
                passes.append({"error": r["error"], "parsed_ok": False, "latency_s": r["latency_s"]})
            else:
                entities = parse_entities(r["response"])
                passes.append({
                    "raw": r["response"][:500],
                    "entities": entities,
                    "latency_s": r["latency_s"],
                    "parsed_ok": entities is not None,
                    "truncated_input": r.get("truncated", False),
                })

        all_parsed = bool(passes) and all(p.get("parsed_ok") for p in passes)
        if all_parsed:
            # Compare every pass against the first one; this works for any
            # N_PASSES instead of assuming exactly three passes.
            extracted = [p["entities"] for p in passes]
            first = extracted[0]
            per_field = {
                k: all(other[k] == first[k] for other in extracted[1:])
                for k in fields
            }
            consistent = all(per_field.values())
        else:
            consistent = False
            per_field = None

        latencies = [p.get("latency_s", 0) for p in passes]
        print(f" parsed={all_parsed} consistent={consistent} latencies={latencies}", flush=True)

        results.append({
            "source": source,
            "doc_chars": len(doc_text),
            "passes": passes,
            "all_parsed": all_parsed,
            "consistent": consistent,
            "per_field_consistency": per_field,
        })

    total_elapsed = round(time.time() - t_total, 1)

    parsed = [r for r in results if r["all_parsed"]]
    consistent_docs = [r for r in parsed if r["consistent"]]

    # Per-field agreement is measured only over documents whose passes all parsed.
    field_rates = {k: 0 for k in fields}
    for r in parsed:
        for k, agreed in (r["per_field_consistency"] or {}).items():
            if agreed:
                field_rates[k] += 1
    field_rates_pct = {
        k: round(100 * v / len(parsed), 1) if parsed else 0.0
        for k, v in field_rates.items()
    }

    summary = {
        "experiment": "003",
        "title": "Entity-Only Consistency Test",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "n_passes": N_PASSES,
        "per_call_timeout_s": PER_CALL_TIMEOUT,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(docs),
        "n_all_parsed": len(parsed),
        "n_fully_consistent": len(consistent_docs),
        "consistency_rate_pct": round(100 * len(consistent_docs) / len(docs), 2) if docs else 0.0,
        "consistency_rate_among_parsed_pct": (
            round(100 * len(consistent_docs) / len(parsed), 2) if parsed else 0.0
        ),
        "per_field_consistency_pct": field_rates_pct,
        "total_elapsed_s": total_elapsed,
        "exp_001_baseline_pct": baseline_pct,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(docs)} docs in {total_elapsed}s")
    print(f"All {N_PASSES} passes parsed cleanly: {len(parsed)}/{len(docs)}")
    print(f"Fully consistent (all 4 fields match): {len(consistent_docs)}/{len(docs)} ({summary['consistency_rate_pct']}%)")
    print(f"Among parsed only: {summary['consistency_rate_among_parsed_pct']}%")
    print(f"Per-field consistency: {field_rates_pct}")
    print(f"Exp 001 baseline: {baseline_pct:.0f}% | delta: {summary['consistency_rate_pct'] - baseline_pct:+.2f} pts")
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,551 @@
|
||||
"""
|
||||
Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
|
||||
|
||||
Reads entities from FalkorDB group_id 'aaron', infers light type labels,
|
||||
computes pairwise similarity within type blocks using ego summary embedding +
|
||||
name string distance + neighbor pattern overlap, generates merge proposals
|
||||
above threshold, writes proposal log for human review.
|
||||
|
||||
Does NOT execute merges. 0.1 is the calibration phase — proposals only,
|
||||
human reviews before any action.
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from falkordb import FalkorDB
|
||||
import numpy as np
|
||||
|
||||
# Configuration
|
||||
GROUP_ID = "aaron"
|
||||
HIGH_CONFIDENCE_THRESHOLD = 0.85 # propose merge above this
|
||||
LOW_CONFIDENCE_THRESHOLD = 0.65 # log as low-confidence below
|
||||
PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
|
||||
PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine of the angle between two embedding vectors.

    Both inputs are converted to float32 arrays. If either vector has zero
    length the similarity is defined as 0.0 instead of dividing by zero.
    """
    vec_a, vec_b = (np.array(v, dtype=np.float32) for v in (a, b))
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if not (norm_a and norm_b):
        return 0.0
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))
|
||||
|
||||
|
||||
def name_similarity(name_a, name_b):
    """Score how alike two entity names are, in the range 0.0 to 1.0.

    The checks run in this order and the first that applies wins:

    1. Case-insensitive exact match after trimming -> 1.0.
    2. Either name has no word tokens -> 0.0.
    3. One lower-cased name is a contiguous substring of the other
       (e.g. "Aaron" / "Aaron Nelson") -> 0.7 plus up to 0.2 scaled by the
       length ratio.
    4. One name's letters appear as a contiguous run of the other's word
       initials (acronym-style) -> 0.85.
    5. Otherwise the Jaccard overlap of the word-token sets, which makes
       word order irrelevant ("Aaron Nelson" / "Nelson, Aaron").
    """

    def initials_contain(candidate, expansion):
        """True if ``candidate`` (upper-cased) occurs within the initials of ``expansion``."""
        words = expansion.split()
        if len(candidate) >= len(expansion) or len(words) < 2:
            return False
        initials = ''.join(word[0].upper() for word in words)
        # An exact match of the initials is just a special case of containment.
        return candidate.upper() in initials

    left = name_a.lower().strip()
    right = name_b.lower().strip()
    if left == right:
        return 1.0

    left_tokens = set(re.findall(r'\b\w+\b', left))
    right_tokens = set(re.findall(r'\b\w+\b', right))
    if not (left_tokens and right_tokens):
        return 0.0

    if left in right or right in left:
        # Strong signal, but discounted by how much shorter one name is.
        shorter, longer = sorted((len(left), len(right)))
        return 0.7 + 0.2 * (shorter / longer)

    # The acronym test uses the untrimmed, original-case names.
    if initials_contain(name_a, name_b) or initials_contain(name_b, name_a):
        return 0.85

    return len(left_tokens & right_tokens) / len(left_tokens | right_tokens)
|
||||
|
||||
|
||||
def infer_type(entity_name, summary):
    """Guess a coarse entity type used only to block pairwise comparisons.

    Rules are checked in a fixed order (person, organization, project,
    place) and the first hit wins; anything else is ``"unknown"``. The rules
    are plain substring/regex heuristics over the name and the summary, so
    the result is a rough label, not a classification.

    Args:
        entity_name: Entity display name.
        summary: Entity summary text; None is treated as empty.

    Returns:
        One of ``"person"``, ``"organization"``, ``"project"``, ``"place"``
        or ``"unknown"``.
    """
    name_text = entity_name.lower().strip()
    summary_text = (summary or "").lower()

    def mentions(haystack, needles):
        """True if any of ``needles`` occurs in ``haystack``."""
        return any(needle in haystack for needle in needles)

    # Person: exactly two title-cased words, or a person-describing summary.
    two_word_name = re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', entity_name.strip()) is not None
    person_phrases = (
        'is a person', 'is a professor', 'is an artist', 'is a colleague',
        'is a friend', 'is a family member', 'works at', 'studied at',
        "'s spouse", "'s child", "'s parent", "'s student",
    )
    if two_word_name or mentions(summary_text, person_phrases):
        return "person"

    # Organization: institutional word in the name, or summary says so.
    org_name_markers = (
        ' inc', ' llc', ' corp', ' company', ' university', ' college',
        ' school', ' institute', ' foundation', ' department',
    )
    org_phrases = (
        'is a company', 'is a university', 'is an organization',
        'is an institution', 'is a department', 'is a nonprofit',
    )
    if mentions(name_text, org_name_markers) or mentions(summary_text, org_phrases):
        return "organization"

    # Project: software or creative work.
    project_phrases = (
        'is a project', 'software project', 'is a codebase',
        'is a tool', 'is a system', 'is an application',
        'is a research project', 'is a design project',
    )
    project_name_markers = (' project', ' system', ' platform')
    if mentions(summary_text, project_phrases) or mentions(name_text, project_name_markers):
        return "project"

    # Place: only recognisable from the summary.
    place_phrases = (
        'is a city', 'is a town', 'is a state', 'is a country',
        'is a neighborhood', 'is a region', 'is a location',
    )
    if mentions(summary_text, place_phrases):
        return "place"

    return "unknown"
|
||||
|
||||
|
||||
def get_neighbors(graph, entity_uuid, limit=20):
    """Return the names of entities one RELATES_TO hop from the given entity.

    Args:
        graph: Graph handle exposing ``query(cypher, params)``.
        entity_uuid: UUID of the entity whose neighborhood is read.
        limit: Maximum number of distinct neighbor names requested.

    Returns:
        A set of neighbor names; empty or missing names are dropped.
    """
    cypher = """
        MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
        RETURN DISTINCT other.name AS name
        LIMIT $limit
    """
    rows = graph.query(cypher, {"uuid": entity_uuid, "limit": limit}).result_set
    names = set()
    for row in rows:
        if row[0]:
            names.add(row[0])
    return names
|
||||
|
||||
|
||||
def neighbor_jaccard(neighbors_a, neighbors_b):
    """Return the containment overlap of two neighbor-name sets.

    Despite the name, this is not Jaccard: the score is
    ``|A ∩ B| / min(|A|, |B|)``, i.e. the share of the smaller neighborhood
    that also appears in the larger one. Containment is used so that an
    alias with few edges is not penalized for being compared against a hub
    entity with many edges, which plain Jaccard would score as dissimilar.

    Returns 0.0 when either set is empty.
    """
    smaller_size = min(len(neighbors_a), len(neighbors_b))
    if smaller_size == 0:
        return 0.0
    shared = neighbors_a & neighbors_b
    return len(shared) / smaller_size
|
||||
|
||||
|
||||
def get_edge_count(graph, entity_uuid):
    """Count the RELATES_TO edges (either direction) touching an entity.

    Returns 0 when the query yields no rows.
    """
    cypher = """
        MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
        RETURN count(r) AS c
    """
    rows = graph.query(cypher, {"uuid": entity_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
|
||||
|
||||
|
||||
def combine_signals(name_sim, ego_sim, neighbor_sim):
    """Blend name, ego (summary) and neighbor similarity into one confidence.

    Four regimes are tried in order; the first that applies decides the score:

    * Strong neighbor containment (>= 0.7) with some ego support (>= 0.3):
      neighbor and ego weighted equally (0.4 each), name 0.2. This is the
      asymmetric-alias case where names differ but one entity's neighbors
      sit inside the other's.
    * Weak ego (< 0.3) and weak neighbor overlap (< 0.4): the stronger of the
      two, capped at 0.4 - the pair is probably unrelated.
    * Near-identical name (>= 0.85) with moderate ego (>= 0.5): ego and name
      0.4 each, neighbor 0.2.
    * Otherwise: ego 0.45, neighbor 0.3, name 0.25.
    """
    contained_alias = neighbor_sim >= 0.7 and ego_sim >= 0.3
    clearly_unrelated = ego_sim < 0.3 and neighbor_sim < 0.4
    same_name = name_sim >= 0.85 and ego_sim >= 0.5

    if contained_alias:
        score = 0.4 * neighbor_sim + 0.4 * ego_sim + 0.2 * name_sim
    elif clearly_unrelated:
        score = min(0.4, max(ego_sim, neighbor_sim))
    elif same_name:
        score = 0.4 * ego_sim + 0.4 * name_sim + 0.2 * neighbor_sim
    else:
        score = 0.45 * ego_sim + 0.3 * neighbor_sim + 0.25 * name_sim
    return score
|
||||
|
||||
|
||||
def compute_summary_embedding(text, model="nomic-embed-text"):
    """Embed an entity summary through the local Ollama embeddings endpoint.

    Summary embeddings give an ego-similarity signal based on what the
    summaries say, which can be high even when the two names share no tokens.

    Args:
        text: Summary text; only the first 2000 characters are sent.
        model: Name of the Ollama embedding model.

    Returns:
        The ``embedding`` value from the response, or None when the text is
        missing or shorter than 10 characters, or when the request fails
        (the error is printed, not raised).
    """
    usable = bool(text) and len(text) >= 10
    if not usable:
        return None
    try:
        reply = requests.post(
            "http://localhost:11434/api/embeddings",
            json={"model": model, "prompt": text[:2000]},
            timeout=30,
        )
        reply.raise_for_status()
        embedding = reply.json().get("embedding")
    except Exception as exc:
        print(f" Embedding error: {exc}")
        return None
    return embedding
|
||||
|
||||
|
||||
def precompute_summary_embeddings(entities, model="nomic-embed-text"):
    """Attach a ``summary_embedding`` to every entity, using an on-disk cache.

    Each entity dict is mutated in place: ``summary_embedding`` is set to the
    cached or freshly computed vector, or to None when no embedding could be
    produced. The cache is keyed by entity uuid only, so a changed summary or
    a different ``model`` still reuses the cached vector.

    Args:
        entities: List of dicts with at least ``uuid`` and ``summary`` keys.
        model: Ollama embedding model name used for cache misses.

    Returns:
        None. Progress is printed; the cache file is rewritten every 100 new
        embeddings and once more at the end.
    """
    print(f"Computing summary embeddings via Ollama ({model})...")
    print(f" Total entities: {len(entities)}")

    cache_path = Path("/home/aaron/aaronai/experiments/summary_embeddings_cache.json")
    cache = {}
    if cache_path.exists():
        try:
            with open(cache_path) as f:
                cache = json.load(f)
        except json.JSONDecodeError as exc:
            # A damaged cache must not block the run; recompute from scratch.
            print(f" Cache unreadable ({exc}); starting with an empty cache")
            cache = {}
        else:
            print(f" Loaded {len(cache)} cached embeddings")

    def save_cache():
        """Write the cache via a temp file so an interrupted run cannot truncate it."""
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = cache_path.with_name(cache_path.name + ".tmp")
        with open(tmp_path, "w") as f:
            json.dump(cache, f)
        os.replace(tmp_path, cache_path)

    new_count = 0
    start = time.time()
    for i, e in enumerate(entities):
        if e["uuid"] in cache:
            e["summary_embedding"] = cache[e["uuid"]]
            continue

        emb = compute_summary_embedding(e["summary"], model=model)
        if not emb:
            e["summary_embedding"] = None
            continue

        e["summary_embedding"] = emb
        cache[e["uuid"]] = emb
        new_count += 1

        # Checkpoint only right after a new embedding lands on a multiple of
        # 100, so failed embeddings never trigger a redundant rewrite.
        if new_count % 100 == 0:
            save_cache()
            elapsed = time.time() - start
            rate = new_count / elapsed if elapsed > 0 else 0
            remaining = (len(entities) - i - 1) / rate if rate > 0 else 0
            print(f" ... {i+1}/{len(entities)} (computed {new_count} new, ~{remaining:.0f}s remaining)")

    # Final save
    save_cache()

    have_embeddings = sum(1 for e in entities if e.get("summary_embedding"))
    print(f" Done. {have_embeddings}/{len(entities)} entities have summary embeddings")
|
||||
|
||||
|
||||
def generate_proposals():
    """Compare entities pairwise and collect merge candidates.

    Entities with a name embedding and a summary are loaded from the
    ``GROUP_ID`` graph, given summary embeddings and a coarse inferred type,
    and then compared: typed entities only within their own type, and
    'unknown' entities against each other and against every typed entity.

    For each pair, name similarity and ego similarity (summary embeddings
    when both exist, name embeddings otherwise) are computed first. Pairs
    weak on both are skipped before any graph lookup. The remainder get a
    neighbor-overlap score and a combined confidence.

    Returns:
        A tuple ``(proposals, low_confidence, n_entities, n_pairs)`` where
        ``proposals`` holds records with confidence at or above
        ``HIGH_CONFIDENCE_THRESHOLD``, ``low_confidence`` holds records at or
        above ``LOW_CONFIDENCE_THRESHOLD``, and ``n_pairs`` is the number of
        candidate pairs enumerated (before pre-filtering).
    """
    # Function-scope import keeps this block self-contained.
    from itertools import combinations

    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)

    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
        MATCH (n:Entity)
        WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
        RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)
    entities = [
        {'uuid': row[0], 'name': row[1], 'summary': row[2], 'embedding': row[3]}
        for row in result.result_set
    ]
    print(f" Loaded {len(entities)} entities with embeddings")

    # Compute summary embeddings (true ego signal, beyond name embeddings)
    precompute_summary_embeddings(entities)

    # Infer types and group by them for blocking
    print("Inferring entity types for blocking...")
    blocks = defaultdict(list)
    for e in entities:
        e['inferred_type'] = infer_type(e['name'], e['summary'])
        blocks[e['inferred_type']].append(e)
    for t, members in sorted(blocks.items(), key=lambda kv: -len(kv[1])):
        print(f" {t}: {len(members)}")

    print()
    print("Comparing entities within type blocks...")
    proposals = []
    low_confidence = []

    typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'}
    unknown_block = blocks.get('unknown', [])

    # Within-type pairs, then unknown-vs-unknown, then unknown-vs-typed
    # (an unknown entity might turn out to be of any type).
    pairs_to_compare = []
    for ents in typed_blocks.values():
        pairs_to_compare.extend(combinations(ents, 2))
    pairs_to_compare.extend(combinations(unknown_block, 2))
    pairs_to_compare.extend(
        (ent_unknown, ent_typed)
        for ent_unknown in unknown_block
        for ents in typed_blocks.values()
        for ent_typed in ents
    )
    print(f" Pairs to compare: {len(pairs_to_compare):,}")

    # Per-entity graph lookups are memoized: the same entity appears in many pairs.
    cache_neighbors = {}
    cache_edge_counts = {}

    def neighbors_cached(uuid):
        if uuid not in cache_neighbors:
            cache_neighbors[uuid] = get_neighbors(graph, uuid)
        return cache_neighbors[uuid]

    def edge_count_cached(uuid):
        if uuid not in cache_edge_counts:
            cache_edge_counts[uuid] = get_edge_count(graph, uuid)
        return cache_edge_counts[uuid]

    def describe(ent):
        """Build the per-entity section of a proposal record."""
        return {
            'uuid': ent['uuid'],
            'name': ent['name'],
            'type': ent['inferred_type'],
            'summary': ent['summary'][:200],
            'edge_count': edge_count_cached(ent['uuid']),
        }

    for comparisons_done, (ent_a, ent_b) in enumerate(pairs_to_compare, start=1):
        if comparisons_done % 5000 == 0:
            print(f" ... {comparisons_done:,} / {len(pairs_to_compare):,}")

        # Name similarity handles formal/informal pairs and acronyms.
        name_sim = name_similarity(ent_a['name'], ent_b['name'])

        # Ego similarity prefers SUMMARY embeddings (semantic content) and
        # falls back to name embeddings when either summary embedding is missing.
        if ent_a.get('summary_embedding') and ent_b.get('summary_embedding'):
            ego_sim_quick = cosine_similarity(ent_a['summary_embedding'], ent_b['summary_embedding'])
        else:
            ego_sim_quick = cosine_similarity(ent_a['embedding'], ent_b['embedding'])

        # Pre-filter to avoid the neighbor query on obviously different pairs.
        # Thresholds are deliberately loose so weak name/ego pairs can still
        # be rescued by the containment-based neighbor check.
        if ego_sim_quick < 0.3 and name_sim < 0.15:
            continue

        neighbors_a = neighbors_cached(ent_a['uuid'])
        neighbors_b = neighbors_cached(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)

        confidence = combine_signals(name_sim, ego_sim_quick, neighbor_sim)

        # Pairs below the low threshold are not reported at all, so skip the
        # record (and its edge-count graph queries) for them.
        if confidence < LOW_CONFIDENCE_THRESHOLD:
            continue

        record = {
            'entity_a': describe(ent_a),
            'entity_b': describe(ent_b),
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim_quick, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
        }

        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            proposals.append(record)
        else:
            low_confidence.append(record)

    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), len(pairs_to_compare)
|
||||
|
||||
|
||||
def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    """Write the review log (Markdown) and the raw results (JSON) for one run.

    Both files are created in ``PROPOSALS_DIR`` and named
    ``proposals-<UTC timestamp to the minute>``. Proposals are listed from
    highest to lowest confidence, each with an empty decision box for the
    reviewer. Only the first 30 low-confidence candidates appear in the
    Markdown; the JSON contains all of them.

    Args:
        proposals: High-confidence records from ``generate_proposals``.
        low_confidence: Low-confidence records from ``generate_proposals``.
        total_entities: Number of entities scanned.
        total_comparisons: Number of candidate pairs enumerated.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"

    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])

    def entity_lines(label, ent):
        """Render the three Markdown lines describing one side of a proposal."""
        return [
            f'**Entity {label}:** "{ent["name"]}" (type: {ent["type"]}, {ent["edge_count"]} edges)',
            f" - uuid: `{ent['uuid']}`",
            f" - summary: {ent['summary']}",
        ]

    lines = [
        f"# Consolidator 0.1 — Run {timestamp}",
        "",
        "## Statistics",
        f"- Entities scanned: {total_entities:,}",
        f"- Pairwise comparisons: {total_comparisons:,}",
        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
        "",
        "## How to review",
        "",
        "For each proposal, mark your decision by changing `[ ]` to one of:",
        "- `[APPROVE]` — execute this merge on next run",
        "- `[REJECT]` — don't merge, don't propose again",
        "- `[DEFER]` — re-surface in next run for further consideration",
        "",
        "Save the file when done. Do not modify proposal_id or uuid fields.",
        "",
        "---",
        "",
        f"## Proposed Merges (n={len(proposals)})",
        "",
    ]

    for i, p in enumerate(proposals_sorted, start=1):
        signals = p['signals']
        lines.extend([
            f"### Proposal {i}",
            "",
            "**Decision:** [ ]",
            "",
            f"**Confidence:** {p['confidence']}",
            "",
        ])
        lines.extend(entity_lines("A", p['entity_a']))
        lines.append("")
        lines.extend(entity_lines("B", p['entity_b']))
        lines.extend([
            "",
            "**Signals:**",
            f" - Name similarity: {signals['name_similarity']}",
            f" - Ego (summary) similarity: {signals['ego_similarity']}",
            f" - Neighbor overlap: {signals['neighbor_overlap']}",
        ])
        if p['shared_neighbors']:
            shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
            lines.append(f" - Shared neighbors (sample): {shared_str}")
        lines.extend([
            "",
            "**Optional rejection note:** ",
            "",
            "---",
            "",
        ])

    lines.extend([
        "",
        f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)",
        "",
    ])
    for p in low_sorted[:30]:
        signals = p['signals']
        lines.append(
            f'- **{p["confidence"]}** "{p["entity_a"]["name"]}" + "{p["entity_b"]["name"]}" '
            f'(name={signals["name_similarity"]}, ego={signals["ego_similarity"]}, nbr={signals["neighbor_overlap"]})'
        )
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")

    # The log contains non-ASCII characters ("—", "≥") and arbitrary entity
    # text, so the encoding is pinned instead of relying on the locale.
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nProposal log written to: {out_path}")

    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w', encoding="utf-8") as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")
|
||||
|
||||
|
||||
def main():
    """Run one calibration pass: generate proposals, then write the review log."""
    rule = "=" * 70
    for banner_line in (rule, "Consolidator 0.1 — Calibration Phase", rule, ""):
        print(banner_line)

    run_output = generate_proposals()
    write_proposals_log(*run_output)

    for closing_line in (
        "",
        "Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER",
        "for each proposal. Re-run will read decisions and execute approved merges.",
    ):
        print(closing_line)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,179 @@
|
||||
"""
|
||||
Measure actual Graphiti BULK episode cost on a stratified sample.
|
||||
Uses /episodes/bulk endpoint. Submits in small batches to avoid rate limits.
|
||||
"""
|
||||
import json, os, random, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SAMPLE_SIZE = 50
|
||||
BATCH_SIZE = 5
|
||||
RANDOM_SEED = 42
|
||||
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
|
||||
OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_stratified_sample():
    """Draw a length-stratified, reproducible sample of whole documents.

    All chunks are concatenated per source (id order, blank-line separated),
    sources with an empty document are dropped, and the rest are split into
    three length buckets: short (< 1000 chars), medium (1000-4999) and long
    (>= 5000). Up to 15 short, 25 medium and 10 long documents are sampled.

    Returns:
        A list of ``(source, document)`` tuples: short picks first, then
        medium, then long.
    """
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            # ORDER BY makes the pool order stable; without it the seeded
            # draw below would pick different documents from run to run.
            cur.execute("""
                SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
                FROM embeddings
                GROUP BY source
                ORDER BY source
            """)
            sources = [(s, doc) for s, doc in cur.fetchall() if doc]
        finally:
            cur.close()
    finally:
        conn.close()

    # A private generator gives the same sequence as seeding the module-level
    # one, without altering global random state for other code.
    rng = random.Random(RANDOM_SEED)
    short = [(s, d) for s, d in sources if len(d) < 1000]
    medium = [(s, d) for s, d in sources if 1000 <= len(d) < 5000]
    long_ = [(s, d) for s, d in sources if len(d) >= 5000]

    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")
    sample = (
        rng.sample(short, min(15, len(short))) +
        rng.sample(medium, min(25, len(medium))) +
        rng.sample(long_, min(10, len(long_)))
    )
    print(f"Sample: {len(sample)} sources, batch_size={BATCH_SIZE}")
    return sample
|
||||
|
||||
|
||||
def submit_bulk_batch(batch):
    """POST one batch of (source, document) pairs to the bulk episode endpoint.

    Each document is truncated to its first 12000 characters before sending.

    Returns:
        A dict with batch_size, status_code, elapsed_s, elapsed_per_episode_s,
        response, error and sources. Any exception raised while posting or
        decoding the reply is caught and reported through ``error`` with
        ``status_code`` set to None.
    """
    episodes = []
    for name, text in batch:
        episodes.append({
            "name": name,
            "content": text[:12000],
            "source_description": "pgvector_migration_bulk_test",
            "timestamp": "2026-04-28T00:00:00",
        })
    body = {"episodes": episodes}

    started = time.time()
    try:
        reply = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=body, timeout=900)
        took = time.time() - started
        outcome = {
            "batch_size": len(batch),
            "status_code": reply.status_code,
            "elapsed_s": round(took, 2),
            "elapsed_per_episode_s": round(took / len(batch), 2),
        }
        # The JSON body is only decoded for 2xx/3xx replies; otherwise the
        # first 500 characters of the raw text are kept as the error.
        if reply.ok:
            outcome["response"] = reply.json()
            outcome["error"] = None
        else:
            outcome["response"] = None
            outcome["error"] = reply.text[:500]
        outcome["sources"] = [name for name, _ in batch]
        return outcome
    except Exception as exc:
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "elapsed_per_episode_s": None,
            "response": None,
            "error": str(exc)[:500],
            "sources": [name for name, _ in batch],
        }
|
||||
|
||||
|
||||
def main():
    """Run the bulk-ingest cost test and write a JSON summary to OUT.

    Steps: prompt the operator to note current API spend, draw the stratified
    sample, submit it in batches of BATCH_SIZE, aggregate per-batch timings,
    extrapolate a full-corpus runtime from the mean per-episode time, save the
    summary, and print instructions for computing per-episode cost.
    """
    print("=" * 60)
    print("Graphiti BULK Migration Cost Test (Haiku 4.5)")
    print("=" * 60)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()

    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return

    batches = [sample[i:i+BATCH_SIZE] for i in range(0, len(sample), BATCH_SIZE)]
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE} episodes")
    print()

    results = []
    total_start = time.time()
    for i, batch in enumerate(batches, start=1):
        avg_chars = int(sum(len(d) for _, d in batch) / len(batch))
        print(f"[batch {i:2d}/{len(batches)}] n={len(batch)} avg_chars={avg_chars:6d}",
              end=" ", flush=True)
        result = submit_bulk_batch(batch)
        results.append(result)
        error_text = result["error"]
        # "is not None" matches the success/failure split used for the summary
        # below, so a failed request with an empty body is still shown as an
        # error.
        if error_text is not None:
            print(f" ERROR: {error_text[:80]}")
            # Check the HTTP status as well as the text: a 429 reply does not
            # necessarily mention "429" or "rate" in its body.
            rate_limited = (
                result["status_code"] == 429
                or "429" in error_text
                or "rate" in error_text.lower()
            )
            if rate_limited:
                print(" Rate limited - pausing 30s before next batch")
                time.sleep(30)
        else:
            print(f" {result['status_code']} {result['elapsed_s']}s "
                  f"({result['elapsed_per_episode_s']}s/episode)")
    total_elapsed = time.time() - total_start

    successful_batches = [r for r in results if r["error"] is None]
    failed_batches = [r for r in results if r["error"] is not None]
    successful_episodes = sum(r["batch_size"] for r in successful_batches)
    failed_episodes = sum(r["batch_size"] for r in failed_batches)

    summary = {
        "sample_size": len(sample),
        "batch_size": BATCH_SIZE,
        "n_batches": len(batches),
        "successful_batches": len(successful_batches),
        "failed_batches": len(failed_batches),
        "successful_episodes": successful_episodes,
        "failed_episodes": failed_episodes,
        "total_elapsed_s": round(total_elapsed, 1),
        # None when nothing succeeded, so no runtime estimate is produced.
        "mean_elapsed_per_episode_s": round(
            sum(r["elapsed_s"] for r in successful_batches) /
            max(successful_episodes, 1), 2
        ) if successful_episodes else None,
        "results": results,
    }

    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
            total_sources = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        # Release the connection even when the count query raised.
        conn.close()

    summary["total_corpus_sources"] = total_sources
    if summary["mean_elapsed_per_episode_s"]:
        summary["estimated_migration_hours"] = round(
            total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
        )

    OUT.write_text(json.dumps(summary, indent=2))

    print()
    print("=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Episodes: {summary['successful_episodes']}/{summary['sample_size']} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    if summary["mean_elapsed_per_episode_s"]:
        # estimated_migration_hours only exists when the mean is available.
        print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
        print(f"Total corpus sources: {summary['total_corpus_sources']}")
        print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print(f"AFTER:")
    print(f" Wait 5 min; note new Anthropic spend; subtract from $28.61 baseline.")
    print(f" delta / {summary['successful_episodes']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,122 @@
|
||||
"""
|
||||
Retest just the previously-failed batches after raising MAX_QUEUED_QUERIES.
|
||||
Reads failed sources from graphiti_bulk_cost_test.json and resubmits.
|
||||
"""
|
||||
import json, os, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
BATCH_SIZE = 5
|
||||
|
||||
PRIOR_RESULTS = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
|
||||
|
||||
|
||||
def fetch_doc_for_source(cur, source):
    """Return the concatenated document text for ``source``.

    Runs a STRING_AGG over the source's chunks (ordered by id) on the given
    cursor and returns the first column of the fetched row, or None when no
    row comes back.
    """
    sql = """
        SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id)
        FROM embeddings WHERE source = %s
    """
    cur.execute(sql, (source,))
    record = cur.fetchone()
    if not record:
        return None
    return record[0]
|
||||
|
||||
|
||||
def submit_bulk_batch(batch):
    """POST one retry batch of (source, document) pairs to /episodes/bulk.

    Documents are truncated to 12000 characters. Returns a dict with
    batch_size, status_code, elapsed_s, elapsed_per_episode_s, error and
    sources; exceptions are caught and reported through ``error``.
    """
    episodes = []
    for name, text in batch:
        episodes.append({
            "name": name,
            "content": text[:12000],
            "source_description": "pgvector_migration_bulk_retry",
            "timestamp": "2026-04-28T00:00:00",
        })

    started = time.time()
    try:
        reply = requests.post(
            f"{GRAPHITI_URL}/episodes/bulk",
            json={"episodes": episodes},
            timeout=900,
        )
        outcome = {"batch_size": len(batch), "status_code": reply.status_code}
        outcome["elapsed_s"] = round(time.time() - started, 2)
        outcome["elapsed_per_episode_s"] = round((time.time() - started) / len(batch), 2)
        # Non-OK replies keep the first 500 characters of the body as the error.
        outcome["error"] = None if reply.ok else reply.text[:500]
        outcome["sources"] = [name for name, _ in batch]
        return outcome
    except Exception as exc:
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "elapsed_per_episode_s": None,
            "error": str(exc)[:500],
            "sources": [name for name, _ in batch],
        }
|
||||
|
||||
|
||||
def main():
    """Resubmit the sources of every failed batch from the prior bulk run.

    Reads PRIOR_RESULTS, reloads each failed source's text from the embeddings
    table, submits in batches of BATCH_SIZE, and writes a summary to OUT.
    """
    prior = json.loads(PRIOR_RESULTS.read_text())
    failed_sources = [
        src
        for batch_result in prior["results"]
        if batch_result["error"] is not None
        for src in batch_result["sources"]
    ]
    print(f"Retrying {len(failed_sources)} previously-failed sources")

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources_with_docs = []
    for src in failed_sources:
        text = fetch_doc_for_source(cur, src)
        if not text:
            print(f" WARN: could not find doc for source {src}")
            continue
        sources_with_docs.append((src, text))
    cur.close()
    conn.close()
    print(f"Loaded {len(sources_with_docs)} source docs")
    print()

    batches = []
    for start in range(0, len(sources_with_docs), BATCH_SIZE):
        batches.append(sources_with_docs[start:start+BATCH_SIZE])

    results = []
    total_start = time.time()
    for idx, batch in enumerate(batches, start=1):
        avg = int(sum(len(text) for _, text in batch) / len(batch))
        print(f"[batch {idx:2d}/{len(batches)}] n={len(batch)} avg_chars={avg:6d}",
              end=" ", flush=True)
        outcome = submit_bulk_batch(batch)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start

    # A batch counts as failed whenever its error field is set at all.
    successful = []
    failed = []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)

    summary = {
        "n_retry_sources": len(sources_with_docs),
        "n_batches": len(batches),
        "successful_batches": len(successful),
        "failed_batches": len(failed),
        "successful_episodes": sum(o["batch_size"] for o in successful),
        "failed_episodes": sum(o["batch_size"] for o in failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))

    rule = "=" * 60
    print()
    print(rule)
    print("RETRY RESULTS")
    print(rule)
    print(f"Episodes: {summary['successful_episodes']}/{len(sources_with_docs)} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print()
    print(f"Full results: {OUT}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,93 @@
|
||||
"""Retry attempt #2 — for sources that timed out after MAX_QUEUED_QUERIES bump."""
|
||||
import json, os, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
BATCH_SIZE = 3 # smaller batches given timeouts
|
||||
|
||||
PRIOR = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry2.json"
|
||||
|
||||
|
||||
def fetch_doc(cur, source):
    """Return the id-ordered concatenation of ``source``'s chunks, or None if no row is fetched."""
    sql = "SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id) FROM embeddings WHERE source = %s"
    cur.execute(sql, (source,))
    record = cur.fetchone()
    if not record:
        return None
    return record[0]
|
||||
|
||||
|
||||
def submit_batch(batch):
    """POST one batch of (source, document) pairs to /episodes/bulk (retry #2).

    Documents are truncated to 12000 characters. Returns a dict with
    batch_size, status_code, elapsed_s, error and sources; exceptions are
    caught and reported through ``error`` with ``status_code`` None.
    """
    episodes = []
    for name, text in batch:
        episodes.append({
            "name": name,
            "content": text[:12000],
            "source_description": "pgvector_migration_bulk_retry2",
            "timestamp": "2026-04-28T00:00:00",
        })

    started = time.time()
    try:
        reply = requests.post(
            f"{GRAPHITI_URL}/episodes/bulk",
            json={"episodes": episodes},
            timeout=900,
        )
        outcome = {"batch_size": len(batch), "status_code": reply.status_code}
        outcome["elapsed_s"] = round(time.time() - started, 2)
        outcome["error"] = None if reply.ok else reply.text[:500]
        outcome["sources"] = [name for name, _ in batch]
        return outcome
    except Exception as exc:
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "error": str(exc)[:500],
            "sources": [name for name, _ in batch],
        }
|
||||
|
||||
|
||||
def main():
    """Second retry pass over the sources that still failed in the first retry.

    Reads PRIOR, reloads each failed source's text, resubmits in batches of
    BATCH_SIZE, and writes a summary to OUT. Sources with no text in the
    embeddings table are dropped without a message.
    """
    prior = json.loads(PRIOR.read_text())
    failed = [
        src
        for entry in prior["results"]
        if entry["error"] is not None
        for src in entry["sources"]
    ]
    print(f"Retry #2: {len(failed)} sources still failing")

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources = []
    for src in failed:
        text = fetch_doc(cur, src)
        if text:
            sources.append((src, text))
    cur.close()
    conn.close()

    batches = []
    for start in range(0, len(sources), BATCH_SIZE):
        batches.append(sources[start:start+BATCH_SIZE])
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE}\n")

    results = []
    for idx, batch in enumerate(batches, 1):
        avg = int(sum(len(text) for _, text in batch) / len(batch))
        print(f"[batch {idx}/{len(batches)}] n={len(batch)} avg_chars={avg:6d}", end=" ", flush=True)
        outcome = submit_batch(batch)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")

    # A batch counts as failed whenever its error field is set at all.
    succ = []
    fail = []
    for outcome in results:
        (succ if outcome["error"] is None else fail).append(outcome)

    summary = {
        "n_sources": len(sources),
        "successful_batches": len(succ),
        "failed_batches": len(fail),
        "successful_episodes": sum(o["batch_size"] for o in succ),
        "failed_episodes": sum(o["batch_size"] for o in fail),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))
    print()
    print(f"Episodes: {summary['successful_episodes']}/{len(sources)} succeeded")
    print(f"Full results: {OUT}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Measure actual Graphiti episode-add cost on a stratified sample of pgvector sources.
|
||||
"""
|
||||
import json, os, random, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SAMPLE_SIZE = 50
|
||||
RANDOM_SEED = 42
|
||||
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_cost_test.json"
|
||||
OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_stratified_sample():
    """Draw a length-stratified sample of sources from the embeddings table.

    All chunks of a source are concatenated (ordered by id) into one document.
    Documents are split into short (< 1000 chars), medium (1000-4999 chars)
    and long (>= 5000 chars) pools, and up to 15 / 25 / 10 of each are drawn
    after seeding the module-level RNG with RANDOM_SEED.

    Returns:
        A list of (source, full_document) tuples: short picks first, then
        medium, then long.
    """
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            # ORDER BY source: without it the row order is unspecified, and a
            # seeded random.sample over an unordered list is not reproducible.
            cur.execute("""
                SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
                FROM embeddings
                GROUP BY source
                ORDER BY source
            """)
            # Sources whose aggregated document is NULL or empty are dropped.
            sources = [(s, doc) for s, doc in cur.fetchall() if doc]
        finally:
            cur.close()
    finally:
        # Release the connection even when the query raised.
        conn.close()

    random.seed(RANDOM_SEED)
    short = [(s, d) for s, d in sources if len(d) < 1000]
    medium = [(s, d) for s, d in sources if 1000 <= len(d) < 5000]
    long_ = [(s, d) for s, d in sources if len(d) >= 5000]

    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")
    # min(...) keeps random.sample from raising when a pool is smaller than
    # its quota.
    sample = (
        random.sample(short, min(15, len(short))) +
        random.sample(medium, min(25, len(medium))) +
        random.sample(long_, min(10, len(long_)))
    )
    print(f"Sample: {len(sample)} sources")
    return sample
|
||||
|
||||
|
||||
def submit_episode(source: str, document: str) -> dict:
    """POST a single document to the /episodes endpoint and time the call.

    Only the first 12000 characters of ``document`` are sent.

    Returns:
        A dict with source, doc_chars (full length), doc_chars_sent,
        status_code, elapsed_s and error. Exceptions are caught and reported
        through ``error`` with ``status_code`` None.
    """
    body = {
        "name": source,
        "content": document[:12000],
        "source_description": "pgvector_migration_cost_test",
        "timestamp": "2026-04-28T00:00:00",
    }
    started = time.time()
    try:
        reply = requests.post(f"{GRAPHITI_URL}/episodes", json=body, timeout=600)
        outcome = {
            "source": source,
            "doc_chars": len(document),
            "doc_chars_sent": min(len(document), 12000),
            "status_code": reply.status_code,
        }
        outcome["elapsed_s"] = round(time.time() - started, 2)
        # Non-OK replies keep the first 500 characters of the body as the error.
        outcome["error"] = None if reply.ok else reply.text[:500]
        return outcome
    except Exception as exc:
        return {
            "source": source,
            "doc_chars": len(document),
            "doc_chars_sent": min(len(document), 12000),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "error": str(exc)[:500],
        }
|
||||
|
||||
|
||||
def main():
    """Run the one-episode-at-a-time cost test and write a summary to OUT.

    The first sampled source doubles as a smoke test: if it fails, only the
    smoke result is written and the run stops. Otherwise the remaining sources
    are submitted one by one, timings are aggregated overall and per length
    bucket, and a full-corpus runtime is extrapolated.
    """
    rule = "=" * 60
    print(rule)
    print("Graphiti Migration Cost Test (Haiku 4.5)")
    print(rule)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()

    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return

    # Smoke test
    first_source, first_doc = sample[0]
    print(f"Smoke test on first source ({first_source[:50]}...):")
    smoke = submit_episode(first_source, first_doc)
    print(f" status={smoke['status_code']} elapsed={smoke['elapsed_s']}s")
    if smoke["error"]:
        print(f" ERROR: {smoke['error']}")
        OUT.write_text(json.dumps({"smoke_test": smoke}, indent=2))
        print("Halted — fix smoke test before bulk run.")
        return
    print(f" OK. Proceeding with {len(sample)} sources.")
    print()

    # The smoke result counts as episode 1; the timer below covers only the
    # remaining submissions.
    results = [smoke]
    total_start = time.time()
    for idx, (source, doc) in enumerate(sample[1:], start=2):
        if len(doc) < 1000:
            bucket = "short"
        elif len(doc) < 5000:
            bucket = "medium"
        else:
            bucket = "long"
        print(f"[{idx:2d}/{len(sample)}] [{bucket:6s}] [{len(doc):6d}c] {source[:50]:50s}", end=" ", flush=True)
        outcome = submit_episode(source, doc)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start

    successful = []
    failed = []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)

    summary = {
        "sample_size": len(sample),
        "successful": len(successful),
        "failed": len(failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "mean_elapsed_per_episode_s": round(
            sum(o["elapsed_s"] for o in successful) / max(len(successful), 1), 2
        ),
        "by_bucket": {},
        "results": results,
    }

    bucket_bounds = (("short", 0, 1000), ("medium", 1000, 5000), ("long", 5000, 10**9))
    for bname, lo, hi in bucket_bounds:
        members = [o for o in successful if lo <= o["doc_chars"] < hi]
        if not members:
            continue
        summary["by_bucket"][bname] = {
            "n": len(members),
            "mean_elapsed_s": round(sum(o["elapsed_s"] for o in members) / len(members), 2),
            "mean_chars": int(sum(o["doc_chars"] for o in members) / len(members)),
        }

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
    total_sources = cur.fetchone()[0]
    cur.close()
    conn.close()

    summary["total_corpus_sources"] = total_sources
    summary["estimated_migration_hours"] = round(
        total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
    )

    OUT.write_text(json.dumps(summary, indent=2))

    print()
    print(rule)
    print("RESULTS")
    print(rule)
    print(f"Sample: {summary['successful']}/{summary['sample_size']} succeeded, {summary['failed']} failed")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
    for bucket, stats in summary["by_bucket"].items():
        print(f" {bucket:6s} n={stats['n']:3d} chars~{stats['mean_chars']:6d} elapsed~{stats['mean_elapsed_s']}s")
    print()
    print(f"Total corpus sources: {summary['total_corpus_sources']}")
    print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print("AFTER:")
    print(" Wait 5 min; note new Anthropic spend; subtract.")
    print(f" test_cost / {summary['successful']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
E1.4 per-source predicate diversity comparison — fixed version.
|
||||
Looks up episode uuids by name in both production and cascade graphs.
|
||||
"""
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from falkordb import FalkorDB
|
||||
|
||||
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
|
||||
PRODUCTION_GROUP = "aaron"
|
||||
CASCADE_GROUP = "aaron_cascade_e14"
|
||||
|
||||
def get_predicates_for_episode(graph, episode_uuid):
    """Count the distinct RELATES_TO edge names that list ``episode_uuid`` in ``r.episodes``.

    Returns the first cell of the first result row, or 0 when the query
    returns no rows.
    """
    cypher = """
        MATCH ()-[r:RELATES_TO]->()
        WHERE $uuid IN r.episodes
        RETURN count(DISTINCT r.name) AS predicate_count
    """
    records = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not records:
        return 0
    return records[0][0]
|
||||
|
||||
def get_edge_count_for_episode(graph, episode_uuid):
    """Count the RELATES_TO edges that list ``episode_uuid`` in ``r.episodes``.

    Returns the first cell of the first result row, or 0 when the query
    returns no rows.
    """
    cypher = """
        MATCH ()-[r:RELATES_TO]->()
        WHERE $uuid IN r.episodes
        RETURN count(r) AS edge_count
    """
    records = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not records:
        return 0
    return records[0][0]
|
||||
|
||||
def find_episode_uuid(graph, source_name):
    """Look up the uuid of one Episodic node whose name equals ``source_name``.

    The query is limited to a single match. Returns None when no node matches.
    """
    cypher = """
        MATCH (e:Episodic {name: $name})
        RETURN e.uuid AS uuid
        LIMIT 1
    """
    records = graph.query(cypher, {"name": source_name}).result_set
    if not records:
        return None
    return records[0][0]
|
||||
|
||||
def main():
|
||||
db = FalkorDB(host='localhost', port=6379)
|
||||
prod_graph = db.select_graph(PRODUCTION_GROUP)
|
||||
cascade_graph = db.select_graph(CASCADE_GROUP)
|
||||
|
||||
with open(E14_RESULTS) as f:
|
||||
e14 = json.load(f)
|
||||
|
||||
sources = [r for r in e14['results'] if 'submit_result' in r]
|
||||
print(f"Analyzing {len(sources)} sources...")
|
||||
print()
|
||||
|
||||
comparisons = []
|
||||
missing_prod = 0
|
||||
missing_cascade = 0
|
||||
for src in sources:
|
||||
name = src['name']
|
||||
bucket = src['bucket']
|
||||
|
||||
prod_uuid = find_episode_uuid(prod_graph, name)
|
||||
cascade_uuid = find_episode_uuid(cascade_graph, name)
|
||||
|
||||
if not prod_uuid:
|
||||
missing_prod += 1
|
||||
print(f" WARN: missing in production: {name}")
|
||||
continue
|
||||
if not cascade_uuid:
|
||||
missing_cascade += 1
|
||||
print(f" WARN: missing in cascade: {name}")
|
||||
continue
|
||||
|
||||
prod_preds = get_predicates_for_episode(prod_graph, prod_uuid)
|
||||
cascade_preds = get_predicates_for_episode(cascade_graph, cascade_uuid)
|
||||
prod_edges = get_edge_count_for_episode(prod_graph, prod_uuid)
|
||||
cascade_edges = get_edge_count_for_episode(cascade_graph, cascade_uuid)
|
||||
|
||||
comparisons.append({
|
||||
"name": name,
|
||||
"bucket": bucket,
|
||||
"prod_preds": prod_preds,
|
||||
"cascade_preds": cascade_preds,
|
||||
"delta_preds": cascade_preds - prod_preds,
|
||||
"prod_edges": prod_edges,
|
||||
"cascade_edges": cascade_edges,
|
||||
"delta_edges": cascade_edges - prod_edges,
|
||||
})
|
||||
|
||||
if missing_prod or missing_cascade:
|
||||
print()
|
||||
print(f"Missing: {missing_prod} prod, {missing_cascade} cascade")
|
||||
print()
|
||||
|
||||
if not comparisons:
|
||||
print("No comparable sources found. Aborting.")
|
||||
return
|
||||
|
||||
# Per-source detail
|
||||
print(f"{'Bucket':<10} {'Source':<58} {'Preds A→B':<14} {'Δ':<6} {'Edges A→B':<14} {'Δ'}")
|
||||
print("-" * 115)
|
||||
for c in sorted(comparisons, key=lambda x: (x['bucket'], x['name'])):
|
||||
name_short = (c['name'][:55] + '..') if len(c['name']) > 58 else c['name']
|
||||
preds_str = f"{c['prod_preds']}→{c['cascade_preds']}"
|
||||
edges_str = f"{c['prod_edges']}→{c['cascade_edges']}"
|
||||
print(f"{c['bucket']:<10} {name_short:<58} {preds_str:<14} {c['delta_preds']:+d} {edges_str:<14} {c['delta_edges']:+d}")
|
||||
|
||||
# Per-bucket aggregation
|
||||
print()
|
||||
print("=" * 115)
|
||||
print("PER-BUCKET AGGREGATION")
|
||||
print("=" * 115)
|
||||
by_bucket = defaultdict(list)
|
||||
for c in comparisons:
|
||||
by_bucket[c['bucket']].append(c)
|
||||
|
||||
for bucket in ['high', 'mid', 'low', 'document']:
|
||||
items = by_bucket.get(bucket, [])
|
||||
if not items:
|
||||
continue
|
||||
n = len(items)
|
||||
sum_pp = sum(c['prod_preds'] for c in items)
|
||||
sum_cp = sum(c['cascade_preds'] for c in items)
|
||||
sum_pe = sum(c['prod_edges'] for c in items)
|
||||
sum_ce = sum(c['cascade_edges'] for c in items)
|
||||
positive = sum(1 for c in items if c['delta_preds'] > 0)
|
||||
negative = sum(1 for c in items if c['delta_preds'] < 0)
|
||||
flat = sum(1 for c in items if c['delta_preds'] == 0)
|
||||
pct_pred = ((sum_cp - sum_pp) / sum_pp * 100) if sum_pp else 0
|
||||
pct_edge = ((sum_ce - sum_pe) / sum_pe * 100) if sum_pe else 0
|
||||
print(f"\n{bucket.upper()} (n={n}):")
|
||||
print(f" Predicates: {sum_pp} → {sum_cp} ({pct_pred:+.1f}%)")
|
||||
print(f" Edges: {sum_pe} → {sum_ce} ({pct_edge:+.1f}%)")
|
||||
print(f" Outcomes: {positive} positive, {flat} flat, {negative} negative")
|
||||
|
||||
# Aggregate
|
||||
print()
|
||||
print("=" * 115)
|
||||
print(f"AGGREGATE (n={len(comparisons)})")
|
||||
print("=" * 115)
|
||||
total_pp = sum(c['prod_preds'] for c in comparisons)
|
||||
total_cp = sum(c['cascade_preds'] for c in comparisons)
|
||||
total_pe = sum(c['prod_edges'] for c in comparisons)
|
||||
total_ce = sum(c['cascade_edges'] for c in comparisons)
|
||||
print(f" Predicates: {total_pp} → {total_cp} ({(total_cp-total_pp)/total_pp*100:+.1f}%)")
|
||||
print(f" Edges: {total_pe} → {total_ce} ({(total_ce-total_pe)/total_pe*100:+.1f}%)")
|
||||
|
||||
out_path = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
|
||||
with open(out_path, "w") as f:
|
||||
json.dump(comparisons, f, indent=2)
|
||||
print()
|
||||
print(f"Saved to {out_path}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1.4 orchestration — cascade re-extraction at n=30, group_id=aaron_cascade_e14."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "e14_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "e14_cascade_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_e14"
|
||||
MAX_DOC_CHARS = 12000
|
||||
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
    """Open and return a new psycopg2 connection using PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def fetch_source_text(source):
    """Return the full text for ``source``, or None when it has no chunks.

    The source's chunks in the embeddings table are concatenated in id order.
    A fresh connection is opened per call and is always closed, including
    when the query raises.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        conn.close()
    # STRING_AGG over zero rows yields a single NULL, so both cases mean
    # "no text for this source".
    if row is None or row[0] is None:
        return None
    return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text, max_retries=2):
    """Ask the local ``mistral:latest`` model for document metadata.

    The text is truncated to MAX_DOC_CHARS, appended to METADATA_PROMPT, and
    posted to the generate endpoint on localhost:11434 with JSON output
    requested.

    Args:
        text: Full document text.
        max_retries: Total number of attempts; only read timeouts and
            connection errors trigger another attempt (after a 5 s pause).

    Returns:
        The parsed metadata dict, with ``char_length`` overwritten by the
        length of the truncated text, or a dict containing an ``error`` key
        when the reply is not a JSON object or every attempt failed.

    Raises:
        requests.exceptions.HTTPError: for a non-2xx reply (not retried).
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    last_err = None
    for attempt in range(max_retries):
        try:
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
                timeout=300,
            )
            response.raise_for_status()
            raw = response.json()["response"]
            try:
                metadata = json.loads(raw)
            except json.JSONDecodeError:
                return {"error": "JSON parse failed", "raw": raw[:500]}
            # Valid JSON need not be an object; a list, string or number
            # cannot take the char_length key and would crash the whole run.
            if not isinstance(metadata, dict):
                return {"error": "JSON parse failed: not an object", "raw": raw[:500]}
            # Trust the measured length rather than the model's own estimate.
            metadata["char_length"] = len(truncated)
            return metadata
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError) as e:
            last_err = e
            if attempt < max_retries - 1:
                print(f" (retry {attempt+1} after {type(e).__name__})", end=" ", flush=True)
                time.sleep(5)
                continue
    return {"error": f"After {max_retries} retries: {last_err}"}
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
    """Turn a metadata dict into a short orientation paragraph for extraction.

    Returns None when ``metadata`` carries an ``error`` key. Missing
    ``domain_class`` / ``primary_format`` fall back to "unknown" and a missing
    ``one_sentence_summary`` to an empty string.
    """
    if "error" in metadata:
        return None

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")

    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
|
||||
|
||||
|
||||
def submit_episode_singular(name, content, custom_instructions):
    """POST one episode to the sidecar's /episodes endpoint under TEST_GROUP_ID.

    ``content`` is truncated to MAX_DOC_CHARS. Returns the decoded JSON reply;
    a non-2xx reply raises via ``raise_for_status``.
    """
    body = dict(
        name=name,
        content=content[:MAX_DOC_CHARS],
        source_description="e14_replication_run",
        timestamp="2026-04-29T00:00:00",
        group_id=TEST_GROUP_ID,
        custom_extraction_instructions=custom_instructions,
    )
    reply = requests.post(f"{SIDECAR_URL}/episodes", json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
|
||||
|
||||
|
||||
def load_state():
    """Load earlier results so an interrupted run can resume.

    Returns:
        A ``(results, completed)`` pair: the stored result records and the set
        of names whose record has a ``submit_result``. Both are empty when
        RESULTS_FILE does not exist.
    """
    if not RESULTS_FILE.exists():
        return [], set()

    with open(RESULTS_FILE) as handle:
        stored = json.load(handle)

    records = stored.get("results", [])
    # Only records that reached a successful submit count as done.
    finished = {rec["name"] for rec in stored.get("results", []) if "submit_result" in rec}
    return records, finished
|
||||
|
||||
|
||||
def main():
    """Run the E1.4 cascade re-extraction over the selected sample.

    For each selected episode not already completed: fetch its text, generate
    Mistral metadata, turn that into extraction instructions, and submit the
    episode to the sidecar. The results file is rewritten after every episode
    so an interrupted run can resume.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]

    results, completed = load_state()
    if completed:
        print(f"Resuming — {len(completed)} sources already completed, {len(selected) - len(completed)} remaining\n")
    else:
        print(f"E1.4 cascade replication — {len(selected)} episodes to group_id={TEST_GROUP_ID}\n")

    def persist(record):
        # Drop any earlier (failed) record for this source first: on resume a
        # retried source would otherwise appear twice in the results file.
        results[:] = [r for r in results if r.get("name") != record["name"]]
        results.append(record)
        with open(RESULTS_FILE, "w") as f:
            json.dump({"results": results}, f, indent=2, default=str)

    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        if name in completed:
            print(f"[{i}/{len(selected)}] [{bucket}] {name} — SKIP (already completed)")
            continue

        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}
        if ep.get("subtype"):
            record["subtype"] = ep["subtype"]

        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            persist(record)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # None when metadata generation failed; the episode is still submitted.
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions
        print(f" Submitting via /episodes...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best effort: record the failure and move on to the next episode.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        persist(record)
        print()

    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
E1_SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
OUTPUT = EXPERIMENTS / "e14_sample.json"
|
||||
|
||||
TARGETS = {"high": 8, "mid": 8, "low": 8, "document": 6}
|
||||
|
||||
def query_episode_counts():
    """Return per-episode entity counts from the ``aaron`` graph.

    Runs a Cypher query through ``redis-cli`` inside the ``falkordb``
    container and parses the plain-text reply as name / count line pairs.
    A line equal to ``name`` is skipped together with the line after it,
    and parsing stops at the first line starting with ``Cached`` or
    ``Query``.

    Returns:
        list[dict]: ``{"name": str, "entities": int}`` records in reply order.
    """
    cypher = (
        "MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
        "RETURN e.name AS name, count(distinct n) AS entities "
        "ORDER BY entities DESC"
    )
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", cypher]
    completed = subprocess.run(command, capture_output=True, text=True)

    rows = [row for row in completed.stdout.split("\n") if row.strip()]
    total = len(rows)
    episodes = []
    pos = 0
    while pos < total:
        current = rows[pos]
        if current == "name":
            pos += 2
            continue
        if current.startswith(("Cached", "Query")):
            break
        if pos + 1 >= total:
            pos += 1
            continue
        try:
            entity_count = int(rows[pos + 1])
        except ValueError:
            # Next line is not a count: resynchronise one line further on
            pos += 1
        else:
            episodes.append({"name": current, "entities": entity_count})
            pos += 2
    return episodes
|
||||
|
||||
|
||||
def is_document(name):
    """Return True when *name* ends with a known document file extension.

    The comparison is case-insensitive.
    """
    document_suffixes = (".pdf", ".docx", ".pptx", ".txt", ".md")
    return name.lower().endswith(document_suffixes)
|
||||
|
||||
|
||||
def doc_subtype(name):
    """Guess a document's subtype from keywords in its name.

    Rules are checked in order and the first match wins. Matching is a
    case-insensitive substring test, so short markers such as ``cv`` or
    ``_is`` also match inside longer words.

    Returns:
        str: ``"academic"``, ``"reference"``, ``"creative"`` or ``"other"``.
    """
    lowered = name.lower()
    keyword_rules = (
        ("academic", ("syllabus", "ind study", "_is")),
        ("reference", ("annual", "report", "_ar20", "rtpcc", "novo")),
        ("reference", ("cv", "resume", "application", "cover letter")),
        ("creative", ("marquee", "pptx", "presentation")),
    )
    for subtype, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return subtype
    return "other"
|
||||
|
||||
|
||||
def main():
    """Draw the stratified E1.4 sample and write it to ``OUTPUT``.

    Episodes already used by E1 are excluded. Non-document episodes are
    split into high / mid / low buckets by entity-count quartile boundaries
    and sampled from the middle of each bucket. Documents with at least 5
    entities are sampled per subtype, topping up from the remaining
    documents when the subtype picks fall short of the document target.
    """
    print("Fetching episode entity counts from Tier 1 graph...")
    episodes = query_episode_counts()
    print(f"Got {len(episodes)} episodes")
    if not episodes:
        # Without data the quartile lookup below would raise IndexError
        print("No episodes returned; cannot compute quartile boundaries.")
        return

    # Load E1's sample to exclude
    with open(E1_SAMPLE_FILE) as f:
        e1_sample = json.load(f)
    e1_names = {ep["name"] for ep in e1_sample["selected"]}
    print(f"Excluding {len(e1_names)} sources from E1")

    # Quartile boundaries
    counts = sorted((e["entities"] for e in episodes), reverse=True)
    n = len(counts)
    top_q = counts[n // 4]
    bottom_q = counts[3 * n // 4]
    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")

    # Filter out E1 and bucket
    available = [e for e in episodes if e["name"] not in e1_names]
    non_docs = [e for e in available if not is_document(e["name"])]

    high = [e for e in non_docs if e["entities"] >= top_q]
    mid = [e for e in non_docs if bottom_q < e["entities"] < top_q]
    # The extra `< top_q` keeps high and low disjoint when both quartile
    # boundaries coincide; it is a no-op whenever top_q > bottom_q.
    low = [e for e in non_docs if e["entities"] <= bottom_q and e["entities"] < top_q]
    docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]

    print(f"\nAvailable after E1 exclusion:")
    print(f" High-density: {len(high)}")
    print(f" Mid-density: {len(mid)}")
    print(f" Low-density: {len(low)}")
    print(f" Documents: {len(docs)}")

    # For high/mid/low: take from middle of bucket (avoids edge cases)
    def pick(bucket, want):
        if len(bucket) < want:
            print(f" WARNING: only {len(bucket)} available, asked for {want}")
            return bucket
        mid_idx = len(bucket) // 2
        start = max(0, mid_idx - want // 2)
        return bucket[start:start + want]

    selected = []
    for label, bucket in (("high", high), ("mid", mid), ("low", low)):
        for ep in pick(bucket, TARGETS[label]):
            ep["bucket"] = label
            selected.append(ep)

    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
    docs_by_subtype = {}
    for ep in docs:
        st = doc_subtype(ep["name"])
        ep["subtype"] = st
        docs_by_subtype.setdefault(st, []).append(ep)
    print(f"\n Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")

    # Pick from middle of each subtype bucket
    for subtype, target in doc_targets.items():
        for ep in pick(docs_by_subtype.get(subtype, []), target):
            ep["bucket"] = "document"
            selected.append(ep)

    # If we're short on documents (e.g., subtype underrepresented), fill from "other"
    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
    if doc_count < TARGETS["document"]:
        shortage = TARGETS["document"] - doc_count
        # Build the lookup set once instead of once per candidate
        already_selected = {s["name"] for s in selected}
        leftover = [e for e in docs if e["name"] not in already_selected]
        for ep in leftover[:shortage]:
            ep["bucket"] = "document"
            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
            selected.append(ep)

    print(f"\nSelected {len(selected)} episodes for E1.4:")
    for ep in selected:
        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
        print(f" [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e {ep['name']}")

    with open(OUTPUT, "w") as f:
        json.dump({
            "metadata": {
                "purpose": "E1.4 cascade re-extraction replication (n=30)",
                "exclusions": "E1's 10 sources",
                "stratification": {**TARGETS, "document_subtypes": doc_targets},
                "quartile_top": top_q,
                "quartile_bottom": bottom_q,
            },
            "selected": selected,
        }, f, indent=2)
    print(f"\nSaved to {OUTPUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,246 @@
|
||||
"""
|
||||
E1.6 analysis — correlate domain-purity ratings with cascade outcomes.
|
||||
Applies pre-registered decision rules from E1.6 protocol.
|
||||
"""
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
RATINGS_PATH = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
|
||||
COMPARISON_PATH = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
|
||||
|
||||
|
||||
def spearman(xs, ys):
    """Compute the Spearman rank correlation of two equal-length sequences.

    Values are converted to 1-based ranks, with tied values sharing the
    average of the ranks they span, and the Pearson correlation of the two
    rank vectors is returned.

    Args:
        xs: first sequence of comparable values.
        ys: second sequence, same length as *xs*.

    Returns:
        float | None: the correlation, or None when there are fewer than
        two observations or either rank vector has no variance.

    Raises:
        ValueError: if *xs* and *ys* differ in length.
    """
    n = len(xs)
    if n < 2:
        return None
    if len(ys) != n:
        # Previously a shorter ys raised IndexError and a longer ys was
        # ranked in full but only partly used, giving a wrong coefficient.
        raise ValueError(f"spearman() needs equal-length inputs, got {n} and {len(ys)}")

    def rank(values):
        order = sorted(range(len(values)), key=lambda idx: values[idx])
        ranks = [0] * len(values)
        start = 0
        while start < len(values):
            # Extend `end` across the run of values tied with order[start]
            end = start
            while end + 1 < len(values) and values[order[end + 1]] == values[order[start]]:
                end += 1
            avg_rank = (start + end) / 2 + 1
            for k in range(start, end + 1):
                ranks[order[k]] = avg_rank
            start = end + 1
        return ranks

    rx = rank(xs)
    ry = rank(ys)
    mean_rx = sum(rx) / n
    mean_ry = sum(ry) / n
    num = sum((a - mean_rx) * (b - mean_ry) for a, b in zip(rx, ry))
    den_x = sum((a - mean_rx) ** 2 for a in rx) ** 0.5
    den_y = sum((b - mean_ry) ** 2 for b in ry) ** 0.5
    if den_x == 0 or den_y == 0:
        return None
    return num / (den_x * den_y)
|
||||
|
||||
|
||||
def main():
    """Join purity ratings with cascade outcomes and print the E1.6 analysis.

    Reads ``RATINGS_PATH`` and ``COMPARISON_PATH``, joins the records by
    source name, prints the primary (binary purity vs outcome), secondary
    (Spearman) and tertiary (within-bucket) tests plus per-group mean
    deltas, and saves the joined records as JSON.
    """
    with open(RATINGS_PATH) as f:
        ratings_data = json.load(f)
    with open(COMPARISON_PATH) as f:
        comparisons = json.load(f)

    ratings_by_name = {r['name']: r for r in ratings_data['ratings']}
    comp_by_name = {c['name']: c for c in comparisons}

    # Join ratings with cascade outcomes
    joined = []
    for name, rating in ratings_by_name.items():
        if name in comp_by_name:
            comp = comp_by_name[name]
            joined.append({
                'name': name,
                'binary': rating['binary'],
                'score': rating['score'],
                'note': rating.get('note'),
                'bucket': comp['bucket'],
                'delta_preds': comp['delta_preds'],
                'delta_edges': comp['delta_edges'],
                'prod_preds': comp['prod_preds'],
                'cascade_preds': comp['cascade_preds'],
            })

    print("=" * 100)
    print(f"E1.6 ANALYSIS — Domain Purity vs Cascade Outcome (n={len(joined)})")
    print("=" * 100)

    # Per-source detail with rating
    print()
    print(f"{'Bucket':<10} {'Source':<48} {'Domain':<8} {'Score':<6} {'Δpreds':<8} {'Δedges':<8}")
    print("-" * 100)
    for j in sorted(joined, key=lambda x: (x['binary'], -x['score'], x['bucket'], x['name'])):
        name_short = (j['name'][:45] + '..') if len(j['name']) > 48 else j['name']
        print(f"{j['bucket']:<10} {name_short:<48} {j['binary']:<8} {j['score']:<6} {j['delta_preds']:+d} {j['delta_edges']:+d}")

    # PRIMARY TEST: binary purity vs cascade outcome distribution
    print()
    print("=" * 100)
    print("PRIMARY TEST: Binary purity vs cascade outcome distribution")
    print("=" * 100)

    def categorize_outcome(delta):
        if delta > 0:
            return 'positive'
        elif delta < 0:
            return 'negative'
        else:
            return 'flat'

    by_binary = defaultdict(lambda: {'positive': 0, 'flat': 0, 'negative': 0, 'total': 0})
    for j in joined:
        outcome = categorize_outcome(j['delta_preds'])
        by_binary[j['binary']][outcome] += 1
        by_binary[j['binary']]['total'] += 1

    print()
    print(f"{'Group':<15} {'n':<5} {'Positive':<12} {'Flat':<10} {'Negative':<12}")
    print("-" * 60)
    for binary in ['single', 'multi']:
        d = by_binary[binary]
        n = d['total']
        if n == 0:
            continue
        pos_pct = d['positive'] / n * 100
        flat_pct = d['flat'] / n * 100
        neg_pct = d['negative'] / n * 100
        print(f"{binary+'-domain':<15} {n:<5} {d['positive']} ({pos_pct:.0f}%) {d['flat']} ({flat_pct:.0f}%) {d['negative']} ({neg_pct:.0f}%)")

    # Compute the gap
    if by_binary['single']['total'] > 0 and by_binary['multi']['total'] > 0:
        single_pos_rate = by_binary['single']['positive'] / by_binary['single']['total'] * 100
        multi_pos_rate = by_binary['multi']['positive'] / by_binary['multi']['total'] * 100
        gap = single_pos_rate - multi_pos_rate
        print()
        print(f"Cascade-positive rate gap (single - multi): {gap:+.1f} percentage points")
        print()
        # Apply pre-registered decision rule
        if gap >= 20:
            verdict = "NARROWNESS HYPOTHESIS SUPPORTED"
            detail = f"Single-domain content is {gap:.0f}pp more likely to gain from cascade than multi-domain."
        elif gap <= -20:
            verdict = "REVERSE OF HYPOTHESIS"
            detail = "Multi-domain content unexpectedly benefits more (counter to prediction)."
        elif abs(gap) < 10:
            verdict = "HYPOTHESIS NOT SUPPORTED"
            detail = "Domain purity does not appear to predict cascade outcome."
        else:
            verdict = "INCONCLUSIVE"
            detail = f"Gap of {gap:+.0f}pp is suggestive but below the pre-registered 20pp threshold."
        print(f" Pre-registered decision rule: {verdict}")
        print(f" {detail}")

    # SECONDARY TEST: Spearman correlation between purity score and predicate delta
    print()
    print("=" * 100)
    print("SECONDARY TEST: Spearman rank correlation (purity score vs predicate delta)")
    print("=" * 100)

    scores = [j['score'] for j in joined]
    deltas_pred = [j['delta_preds'] for j in joined]
    deltas_edge = [j['delta_edges'] for j in joined]

    rho_pred = spearman(scores, deltas_pred)
    rho_edge = spearman(scores, deltas_edge)

    def fmt_rho(rho):
        # spearman() returns None for n < 2 or zero variance; formatting
        # None with ':.3f' would raise TypeError.
        return f"{rho:.3f}" if rho is not None else "n/a"

    print()
    print(f" Spearman ρ (purity score vs Δpredicates): {fmt_rho(rho_pred)}")
    print(f" Spearman ρ (purity score vs Δedges): {fmt_rho(rho_edge)}")
    print()

    if rho_pred is not None:
        if rho_pred >= 0.4:
            v = "STRONG POSITIVE — narrowness hypothesis supported with monotonic relationship"
        elif rho_pred >= 0.2:
            v = "WEAK POSITIVE — consistent with hypothesis but not strong evidence"
        elif rho_pred <= -0.2:
            v = "NEGATIVE — refutes hypothesis"
        else:
            v = "NO CORRELATION — hypothesis not supported"
        print(f" Predicate delta verdict: {v}")
        print()

    # TERTIARY TEST: within-bucket correlation
    print()
    print("=" * 100)
    print("TERTIARY TEST: Within-bucket correlation")
    print("=" * 100)

    by_bucket = defaultdict(list)
    for j in joined:
        by_bucket[j['bucket']].append(j)

    print()
    print(f"{'Bucket':<12} {'n':<5} {'Single':<10} {'Multi':<10} {'ρ (score vs Δpred)':<22}")
    print("-" * 75)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        n_single = sum(1 for j in items if j['binary'] == 'single')
        n_multi = sum(1 for j in items if j['binary'] == 'multi')
        if n >= 3:
            scores_b = [j['score'] for j in items]
            deltas_b = [j['delta_preds'] for j in items]
            rho_b = spearman(scores_b, deltas_b)
            rho_str = f"{rho_b:+.3f}" if rho_b is not None else "n/a (no variance)"
        else:
            rho_str = "n/a (too few)"
        print(f"{bucket:<12} {n:<5} {n_single:<10} {n_multi:<10} {rho_str}")

    # Interaction with bucket: do single/multi outcomes differ within bucket?
    def rate_str(group):
        if not group:
            return "—"
        pos = sum(1 for j in group if j['delta_preds'] > 0)
        return f"{pos}/{len(group)} positive ({pos/len(group)*100:.0f}%)"

    print()
    print("Per-bucket cascade-positive rate by binary purity:")
    print()
    print(f"{'Bucket':<12} {'Single':<25} {'Multi':<25}")
    print("-" * 65)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        single_items = [j for j in items if j['binary'] == 'single']
        multi_items = [j for j in items if j['binary'] == 'multi']
        print(f"{bucket:<12} {rate_str(single_items):<25} {rate_str(multi_items):<25}")

    # MEAN DELTA by binary group
    print()
    print("=" * 100)
    print("MEAN PREDICATE DELTA BY GROUP")
    print("=" * 100)
    print()
    for binary in ['single', 'multi']:
        items = [j for j in joined if j['binary'] == binary]
        if not items:
            continue
        n = len(items)
        mean_dp = sum(j['delta_preds'] for j in items) / n
        mean_de = sum(j['delta_edges'] for j in items) / n
        sum_pp = sum(j['prod_preds'] for j in items)
        sum_cp = sum(j['cascade_preds'] for j in items)
        pct_change = (sum_cp - sum_pp) / sum_pp * 100 if sum_pp else 0
        print(f"{binary}-domain (n={n}):")
        print(f" Mean Δpredicates per source: {mean_dp:+.2f}")
        print(f" Mean Δedges per source: {mean_de:+.2f}")
        print(f" Aggregate predicate change: {sum_pp} → {sum_cp} ({pct_change:+.1f}%)")
        print()

    # Save joined data for the experiments log writeup
    out_path = "/home/aaron/aaronai/experiments/e16_joined_analysis.json"
    with open(out_path, "w") as f:
        json.dump(joined, f, indent=2)
    print(f"Joined data saved to {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
E1.6 domain-purity rating interface — with full metadata context.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
|
||||
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
|
||||
RATINGS_OUT = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
|
||||
|
||||
INTRO = """
|
||||
================================================================================
|
||||
E1.6 — DOMAIN-PURITY RATING
|
||||
================================================================================
|
||||
|
||||
Two ratings per source:
|
||||
|
||||
1. BINARY — single-domain (s) or multi-domain (m)?
|
||||
|
||||
Mental test: "If Mistral had to pick ONE domain class for this source,
|
||||
would picking just one significantly UNDER-DESCRIBE the content?"
|
||||
|
||||
YES → MULTI-DOMAIN (m) — content lives across two+ frames meaningfully
|
||||
NO → SINGLE-DOMAIN (s) — content fits cleanly within one frame
|
||||
|
||||
2. SCORE (1-5) — how cleanly does it fit?
|
||||
|
||||
5 = unambiguously one domain
|
||||
4 = primarily one domain, slight other element
|
||||
3 = balanced two-domain
|
||||
2 = primarily two-domain with traces of a third
|
||||
1 = three or more domain frames weighted significantly
|
||||
|
||||
Single binary usually = score 4-5
|
||||
Multi binary usually = score 1-3
|
||||
|
||||
You see for each source: name, length, AND the full Mistral metadata block
|
||||
(domain_class, primary_format, structural_signals, content_signals, summary).
|
||||
|
||||
Blind to: bucket assignment, cascade outcome.
|
||||
|
||||
Commands at any prompt: 's', 'm', 'skip', 'quit'
|
||||
================================================================================
|
||||
""".strip()
|
||||
|
||||
|
||||
def load_existing():
    """Load previously saved rating state, or start a fresh one.

    Returns:
        dict: the parsed contents of ``RATINGS_OUT`` when that path exists,
        otherwise a new state with empty ``ratings`` and ``completed_names``.
    """
    if not os.path.exists(RATINGS_OUT):
        return {"ratings": [], "completed_names": []}
    with open(RATINGS_OUT) as handle:
        return json.load(handle)
|
||||
|
||||
def save(data):
    """Write the rating state to ``RATINGS_OUT`` as indented JSON.

    The state is serialized to a sibling temporary file and then moved into
    place with ``os.replace``, so an interruption mid-write does not leave a
    truncated ratings file where the previous progress used to be.

    Args:
        data: JSON-serializable rating state.
    """
    tmp_path = f"{RATINGS_OUT}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, RATINGS_OUT)
|
||||
|
||||
def render_metadata(metadata):
    """Print the Mistral metadata block for one source.

    Known fields are printed first in a fixed order, followed by every
    other key except ``char_length``. Non-dict input and dicts carrying an
    ``error`` key produce a single placeholder line instead.
    """
    if not isinstance(metadata, dict):
        print(" (metadata unavailable)")
        return
    if 'error' in metadata:
        print(f" (metadata error: {metadata['error']})")
        return

    # Fields shown first, in this stable order
    standard_fields = [
        'domain_class',
        'primary_format',
        'structural_signals',
        'content_signals',
        'summary',
    ]

    def heading(key):
        return key.replace('_', ' ').title()

    for key in standard_fields:
        if key not in metadata:
            continue
        content = metadata[key]
        title = heading(key)
        if isinstance(content, list):
            if not content:
                print(f" {title}: (none)")
                continue
            print(f" {title}:")
            for entry in content:
                print(f" - {entry}")
        elif isinstance(content, str) and len(content) > 70:
            # Long strings go on their own line below the label
            print(f" {title}:")
            print(f" {content}")
        else:
            print(f" {title}: {content}")

    # Any remaining keys, in dict order
    for key in metadata:
        if key in standard_fields or key == 'char_length':
            continue
        print(f" {heading(key)}: {metadata[key]}")
|
||||
|
||||
def render_source(src, idx, total):
    """Print the header, name, length and metadata for one source.

    Args:
        src: source record; reads ``name``, ``doc_chars`` and the optional
            ``metadata`` key.
        idx: position shown in the ``Source idx/total`` banner.
        total: total number of sources shown in the banner.
    """
    banner = "=" * 80
    print()
    print(banner)
    print(f" Source {idx}/{total}")
    print(banner)
    print(f"Name: {src['name']}")
    print(f"Length: {src['doc_chars']:,} chars")
    print()
    print("Mistral metadata:")
    print()
    render_metadata(src.get('metadata', {}))
    print()
    print("-" * 80)
|
||||
|
||||
def get_rating():
    """Prompt for one source's binary purity call, score and optional note.

    Returns:
        ``'quit'`` when the rater types quit at either prompt, ``None`` when
        the source is skipped, otherwise a dict with ``binary``
        (``"single"`` / ``"multi"``), ``score`` (1-5) and ``note``
        (None when left blank).
    """
    valid_choices = ('s', 'm', 'skip', 'quit')
    binary_prompt = "Single-domain or multi-domain? [s/m/skip/quit]: "
    choice = input(binary_prompt).strip().lower()
    while choice not in valid_choices:
        print(" Please enter 's', 'm', 'skip', or 'quit'")
        choice = input(binary_prompt).strip().lower()

    if choice == 'quit':
        return 'quit'
    if choice == 'skip':
        return None

    score = None
    while score is None:
        raw = input("Purity score (1=many frames, 5=clearly single): ").strip()
        if raw.lower() == 'quit':
            return 'quit'
        try:
            candidate = int(raw)
        except ValueError:
            print(" Please enter a number 1-5 (or 'quit')")
            continue
        if 1 <= candidate <= 5:
            score = candidate
        else:
            print(" Score must be 1-5")

    note = input("Optional note (Enter to skip): ").strip()

    return {
        "binary": "single" if choice == 's' else "multi",
        "score": score,
        "note": note or None,
    }
|
||||
|
||||
def main():
    """Run the interactive rating session, saving state after each rating.

    Sources with a ``submit_result`` are shuffled with a fixed seed, those
    already listed in the saved state are dropped, and the rest are shown
    one by one. Quitting or interrupting saves the state and returns.
    """
    with open(E14_RESULTS) as handle:
        e14 = json.load(handle)

    sources = [row for row in e14['results'] if 'submit_result' in row]
    # Fixed seed: the presentation order is the same on every run
    presentation_order = list(sources)
    random.Random(42).shuffle(presentation_order)

    state = load_existing()
    completed = set(state['completed_names'])
    remaining = [src for src in presentation_order if src['name'] not in completed]

    print(INTRO)
    print()
    print(f"Total sources: {len(sources)}")
    print(f"Already rated: {len(completed)}")
    print(f"Remaining: {len(remaining)}")
    print()
    if not remaining:
        print("All sources rated. Run analysis script next.")
        return

    input("Press Enter to begin...")

    first_index = len(completed) + 1
    try:
        for position, src in enumerate(remaining, start=first_index):
            render_source(src, position, len(sources))
            try:
                rating = get_rating()
            except (KeyboardInterrupt, EOFError):
                print("\n\nSaving and exiting...")
                save(state)
                return

            if rating == 'quit':
                print("\nSaving and exiting...")
                save(state)
                return
            if rating is None:
                print(" Skipped")
                continue

            source_name = src['name']
            rating['name'] = source_name
            state['ratings'].append(rating)
            state['completed_names'].append(source_name)
            save(state)
            print(f" Recorded: {rating['binary']}-domain, score={rating['score']}")

        print()
        print("=" * 80)
        print(f"Done. Rated {len(state['ratings'])} sources.")
        print(f"Saved to {RATINGS_OUT}")
    except (KeyboardInterrupt, EOFError):
        print("\n\nSaving...")
        save(state)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 metrics comparison — A (Tier 1 aaron) vs B (cascade aaron_cascade_test) on the 10 sample sources."""
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
COMPARISON_FILE = EXPERIMENTS / "cascade_reextract_comparison.json"
|
||||
|
||||
def query(group_id, cypher):
    """Run *cypher* against graph *group_id* and return redis-cli's stdout.

    The query is issued as ``GRAPH.QUERY`` through ``redis-cli`` inside the
    ``falkordb`` container; the exit status and stderr are not inspected.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
|
||||
|
||||
def parse_int_result(output):
    """Return the first all-digit line of a redis-cli reply as an int.

    Lines are stripped before testing. Returns 0 when no such line exists.
    """
    stripped = (row.strip() for row in output.split("\n"))
    return next((int(row) for row in stripped if row.isdigit()), 0)
|
||||
|
||||
def parse_string_list(output):
    """Extract the data lines of a single-column redis-cli reply.

    Blank lines are ignored. The first line shorter than 60 characters that
    contains none of ``{}[]`` is taken to be the column header; every later
    line is collected until a line starting with ``Cached`` or
    ``Query internal`` ends the reply. Lines before the header is found are
    dropped.
    """
    collected = []
    header_seen = False
    for raw in output.split("\n"):
        text = raw.strip()
        if not text:
            continue
        if text.startswith(("Cached", "Query internal")):
            break
        if header_seen:
            collected.append(text)
        elif len(text) < 60 and not set(text) & set("{}[]"):
            # Column names are not known up front, so the header is
            # recognised by its shape: short and free of brackets/braces
            header_seen = True
    return collected
|
||||
|
||||
def metrics_for_source(group_id, source_name):
    """Get metrics for one source's episode in one group_id.

    Args:
        group_id: graph name passed to ``query``.
        source_name: value matched against the episode's ``name`` property.

    Returns:
        dict: ``entities`` (distinct Entity nodes linked to the episode),
        ``edges`` (relationships touching the episode) and ``rel_types``
        (distinct relationship types on edges of those entities).
    """
    # Escape backslashes first, then double quotes, so a name containing
    # either cannot terminate or corrupt the double-quoted Cypher literal.
    safe_name = source_name.replace("\\", "\\\\").replace('"', '\\"')
    episode = f'(e:Episodic {{name: "{safe_name}"}})'

    # Total entities connected to this episode
    q = f'MATCH {episode}-[]-(n:Entity) RETURN count(distinct n) AS entities'
    entities = parse_int_result(query(group_id, q))

    # Total edges from this episode (all relationship types)
    q = f'MATCH {episode}-[r]-() RETURN count(r) AS edges'
    edges = parse_int_result(query(group_id, q))

    # Distinct relationship types in edges from entities of this episode
    q = (f'MATCH {episode}-[]-(n:Entity)-[r]-() '
         f'RETURN count(distinct type(r)) AS types')
    rel_types = parse_int_result(query(group_id, q))

    return {"entities": entities, "edges": edges, "rel_types": rel_types}
|
||||
|
||||
def main():
    """Compare per-source graph metrics between the two group_ids.

    For each source in ``SAMPLE_FILE`` the entity, edge and relationship-type
    counts are fetched from ``aaron`` (A) and ``aaron_cascade_test`` (B),
    printed as a table with overall and per-bucket aggregates, and written
    to ``COMPARISON_FILE``.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]

    print(f"E1 metrics comparison — {len(selected)} sources, A=aaron vs B=aaron_cascade_test\n")
    print(f"{'Source':<60} {'A.ent':>6} {'B.ent':>6} {'A.edg':>6} {'B.edg':>6} {'A.typ':>6} {'B.typ':>6}")
    print("-" * 110)

    results = []
    for ep in selected:
        name = ep["name"]
        bucket = ep["bucket"]
        a = metrics_for_source("aaron", name)
        b = metrics_for_source("aaron_cascade_test", name)
        record = {
            "name": name, "bucket": bucket,
            "a_entities": a["entities"], "b_entities": b["entities"],
            "a_edges": a["edges"], "b_edges": b["edges"],
            "a_rel_types": a["rel_types"], "b_rel_types": b["rel_types"],
        }
        results.append(record)
        # Truncate name for display
        display_name = name if len(name) <= 58 else name[:55] + "..."
        print(f"{display_name:<60} {a['entities']:>6} {b['entities']:>6} {a['edges']:>6} {b['edges']:>6} {a['rel_types']:>6} {b['rel_types']:>6}")

    if not results:
        # Means below divide by the number of sources
        print("\nNo sources in sample; nothing to aggregate.")
        return

    def pct_delta(new, old):
        # A zero baseline (e.g. every query came back empty) reports 0
        # instead of raising ZeroDivisionError.
        return (new - old) / old * 100 if old else 0.0

    # Aggregates
    print("\n" + "=" * 110)
    n = len(results)
    a_ent_sum = sum(r["a_entities"] for r in results)
    b_ent_sum = sum(r["b_entities"] for r in results)
    a_edge_sum = sum(r["a_edges"] for r in results)
    b_edge_sum = sum(r["b_edges"] for r in results)
    a_types_sum = sum(r["a_rel_types"] for r in results)
    b_types_sum = sum(r["b_rel_types"] for r in results)
    print(f"\nAggregate (n={n}):")
    print(f" Entities: A mean={a_ent_sum/n:.1f} B mean={b_ent_sum/n:.1f} delta={pct_delta(b_ent_sum, a_ent_sum):+.1f}%")
    print(f" Edges: A mean={a_edge_sum/n:.1f} B mean={b_edge_sum/n:.1f} delta={pct_delta(b_edge_sum, a_edge_sum):+.1f}%")
    print(f" Rel types: A mean={a_types_sum/n:.1f} B mean={b_types_sum/n:.1f} delta={pct_delta(b_types_sum, a_types_sum):+.1f}%")

    # Global predicate diversity check (unique types in each group_id)
    print(f"\nGlobal predicate diversity:")
    a_global = parse_int_result(query("aaron", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    b_global = parse_int_result(query("aaron_cascade_test", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    print(f" A (aaron): {a_global} distinct relationship types across whole graph")
    print(f" B (aaron_cascade_test): {b_global} distinct relationship types across whole graph")

    # Per-bucket
    print(f"\nPer-bucket aggregates:")
    for bucket in ["high", "mid", "low", "document"]:
        bucket_results = [r for r in results if r["bucket"] == bucket]
        if not bucket_results:
            continue
        bn = len(bucket_results)
        a_e = sum(r["a_entities"] for r in bucket_results) / bn
        b_e = sum(r["b_entities"] for r in bucket_results) / bn
        a_ed = sum(r["a_edges"] for r in bucket_results) / bn
        b_ed = sum(r["b_edges"] for r in bucket_results) / bn
        print(f" [{bucket:>8}] n={bn} A.ent={a_e:.1f} B.ent={b_e:.1f} ({pct_delta(b_e, a_e):+.0f}%) "
              f"A.edg={a_ed:.1f} B.edg={b_ed:.1f} ({pct_delta(b_ed, a_ed):+.0f}%)")

    with open(COMPARISON_FILE, "w") as f:
        json.dump({
            "results": results,
            "aggregate": {
                "a_entities_total": a_ent_sum, "b_entities_total": b_ent_sum,
                "a_edges_total": a_edge_sum, "b_edges_total": b_edge_sum,
                "global_predicate_diversity": {"a": a_global, "b": b_global},
            },
        }, f, indent=2)
    print(f"\nSaved to {COMPARISON_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 corrected metric — count distinct predicate names on edges originating from each episode."""
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
|
||||
def query(group_id, cypher):
    """Send *cypher* to graph *group_id* and return the raw reply text.

    Uses ``redis-cli GRAPH.QUERY`` inside the ``falkordb`` container; only
    stdout is returned, the exit status is not checked.
    """
    argv = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    proc = subprocess.run(argv, capture_output=True, text=True)
    return proc.stdout
|
||||
|
||||
def get_episode_uuid(group_id, episode_name):
    """Look up the UUID for a given episode name in a given group.

    Returns:
        str | None: the first reply line shaped like a UUID (36 characters,
        four hyphens), or None when the reply contains none.
    """
    # Escape backslashes before single quotes: escaping only the quotes
    # lets a backslash in the name swallow the closing quote or an escape.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for line in (raw.strip() for raw in output.split("\n")):
        # UUID format check
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
|
||||
|
||||
def count_predicates_for_episode(group_id, uuid):
    """Count distinct ``r.name`` values on RELATES_TO edges citing *uuid*.

    An edge counts when *uuid* appears in its ``episodes`` list. Returns 0
    when the reply contains no all-digit line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    reply = query(group_id, cypher)
    for candidate in (row.strip() for row in reply.split("\n")):
        if candidate.isdigit():
            return int(candidate)
    return 0
|
||||
|
||||
def count_total_edges_for_episode(group_id, uuid):
    """Count RELATES_TO edges whose ``episodes`` list contains *uuid*.

    Returns 0 when the reply contains no all-digit line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    reply = query(group_id, cypher)
    digit_rows = [row.strip() for row in reply.split("\n") if row.strip().isdigit()]
    return int(digit_rows[0]) if digit_rows else 0
|
||||
|
||||
# Script body: for every sampled source, resolve its episode UUID in both
# graphs, count the RELATES_TO edges and distinct predicate names that cite
# that UUID, print per-source / aggregate / per-bucket tables, and save the
# records as JSON. A source whose UUID cannot be found counts as 0 edges
# and 0 predicates.
with open(SAMPLE_FILE) as f:
    sample = json.load(f)
selected = sample["selected"]

print(f"E1 corrected per-source comparison — predicates per episode by edge origin\n")
print(f"{'Source':<60} {'A.edges':>8} {'A.preds':>8} {'B.edges':>8} {'B.preds':>8}")
print("-" * 100)

a_pred_total = 0
b_pred_total = 0
a_edge_total = 0
b_edge_total = 0
records = []

for ep in selected:
    name = ep["name"]
    a_uuid = get_episode_uuid("aaron", name)
    b_uuid = get_episode_uuid("aaron_cascade_test", name)

    a_edges = count_total_edges_for_episode("aaron", a_uuid) if a_uuid else 0
    a_preds = count_predicates_for_episode("aaron", a_uuid) if a_uuid else 0
    b_edges = count_total_edges_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0
    b_preds = count_predicates_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0

    display = name if len(name) <= 58 else name[:55] + "..."
    print(f"{display:<60} {a_edges:>8} {a_preds:>8} {b_edges:>8} {b_preds:>8}")

    records.append({
        "name": name, "bucket": ep["bucket"],
        "a_edges": a_edges, "a_preds": a_preds,
        "b_edges": b_edges, "b_preds": b_preds,
    })
    a_pred_total += a_preds
    b_pred_total += b_preds
    a_edge_total += a_edges
    b_edge_total += b_edges

print("-" * 100)
n = len(selected)
# An empty sample has all-zero totals; divide by 1 so the means print as 0.0
# instead of raising ZeroDivisionError.
mean_div = n or 1
print(f"\nAggregate (n={n}):")
print(f" Edges: A total={a_edge_total} mean={a_edge_total/mean_div:.1f} B total={b_edge_total} mean={b_edge_total/mean_div:.1f}")
print(f" Predicates: A total={a_pred_total} mean={a_pred_total/mean_div:.1f} B total={b_pred_total} mean={b_pred_total/mean_div:.1f}")
if a_pred_total > 0:
    print(f" Predicate delta: B vs A = {(b_pred_total-a_pred_total)/a_pred_total*100:+.1f}%")
if a_edge_total > 0:
    print(f" Edge delta: B vs A = {(b_edge_total-a_edge_total)/a_edge_total*100:+.1f}%")

# Per-bucket
print(f"\nPer-bucket:")
for bucket in ["high", "mid", "low", "document"]:
    bucket_records = [r for r in records if r["bucket"] == bucket]
    if not bucket_records:
        continue
    bn = len(bucket_records)
    a_p = sum(r["a_preds"] for r in bucket_records)
    b_p = sum(r["b_preds"] for r in bucket_records)
    a_e = sum(r["a_edges"] for r in bucket_records)
    b_e = sum(r["b_edges"] for r in bucket_records)
    delta = ((b_p-a_p)/a_p*100) if a_p > 0 else 0
    print(f" [{bucket:>8}] n={bn} A.preds={a_p:>3} B.preds={b_p:>3} ({delta:+.0f}%) A.edges={a_e:>3} B.edges={b_e:>3}")

with open(EXPERIMENTS / "cascade_reextract_corrected_comparison.json", "w") as f:
    json.dump({"per_source": records,
               "aggregate": {"a_preds": a_pred_total, "b_preds": b_pred_total,
                             "a_edges": a_edge_total, "b_edges": b_edge_total}}, f, indent=2)
print(f"\nSaved to {EXPERIMENTS / 'cascade_reextract_corrected_comparison.json'}")
|
||||
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 orchestration — fetch source text, run Mistral metadata, submit to Graphiti test group_id."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_test"
|
||||
MAX_DOC_CHARS = 12000 # Same cap as Tier 1 for parity
|
||||
|
||||
# Stage 2 metadata prompt — verbatim from stage-2-worker-spec.md
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
    """Open a new psycopg2 connection using the module-level PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def fetch_source_text(source):
    """Reassemble the full document from pgvector chunks, mirroring tier1_migration.py logic.

    Args:
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        The matching chunks' ``document`` text joined with blank lines in
        ``id`` order, or None when no chunk matches.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises, so a failed fetch does not leak
        # the connection for the rest of the run.
        conn.close()
    if row is None:
        return None
    # STRING_AGG over zero rows yields NULL, which arrives here as None.
    return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text):
    """Call local Mistral via Ollama for base-class metadata.

    Args:
        text: Document text; only the first MAX_DOC_CHARS characters are sent.

    Returns:
        The parsed metadata dict with ``char_length`` overwritten by the
        length of the truncated text, or ``{"error": ..., "raw": ...}`` when
        the model output is not a JSON object.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            error HTTP status from Ollama.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    if not isinstance(metadata, dict):
        # Valid JSON that is not an object (list, string, number) cannot carry
        # the schema; report it as a failure instead of crashing on the
        # char_length assignment below.
        return {"error": "JSON response was not an object", "raw": raw[:500]}
    # Override char_length with python-computed value (per stage-2-worker-spec)
    metadata["char_length"] = len(truncated)
    return metadata
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
    """Render base-class metadata as a Graphiti source_description.

    A metadata dict carrying an ``"error"`` key produces a short failure
    label; otherwise the domain, format and summary are followed by the fixed
    orient-not-bound framing sentences.
    """
    if "error" in metadata:
        return f"tier1_cascade_test (metadata generation failed: {metadata['error']})"

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
|
||||
|
||||
|
||||
def submit_episode(name, content, source_description):
    """POST one episode to the Graphiti sidecar's bulk endpoint under TEST_GROUP_ID.

    The content is cut to MAX_DOC_CHARS before sending. Returns the decoded
    JSON reply; an error HTTP status raises via ``raise_for_status``.
    """
    episode = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": source_description,
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": TEST_GROUP_ID}
    reply = requests.post(f"{SIDECAR_URL}/episodes/bulk", json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
|
||||
|
||||
|
||||
def _save_results(results):
    """Write every record collected so far to RESULTS_FILE as JSON."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Run the E1 cascade re-extraction over every episode in SAMPLE_FILE.

    For each selected episode: fetch its text, generate Mistral metadata,
    format it as a source_description, and submit the episode to the Graphiti
    test group. One record per episode (failures included) is collected and
    the whole list is rewritten to RESULTS_FILE after every episode.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 cascade re-extraction starting — {len(selected)} episodes to test group_id={TEST_GROUP_ID}\n")

    results = []
    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        # Fetch text
        print(" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Persist the failure as well: previously this branch skipped the
            # save, so a missing-text record at the end of the run (or before
            # a crash) never reached the results file.
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        # Mistral metadata
        print(" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # Submit to Graphiti
        source_desc = format_metadata_as_orientation(metadata)
        record["source_description"] = source_desc
        print(" Submitting to Graphiti test group...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode(name, text, source_desc)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best-effort: one failed submission is recorded, not fatal.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        results.append(record)
        # Save intermediate state after each episode
        _save_results(results)
        print()

    print(f"\nDone. Results saved to {RESULTS_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 corrected re-run — cascade orientation passed via custom_extraction_instructions."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_test"
|
||||
MAX_DOC_CHARS = 12000
|
||||
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
    """Return a fresh psycopg2 connection built from the module-level PG_DSN."""
    pg_connection = psycopg2.connect(PG_DSN)
    return pg_connection
|
||||
|
||||
|
||||
def fetch_source_text(source):
    """Rebuild a document's text from its rows in the ``embeddings`` table.

    Args:
        source: Value matched against the ``source`` column.

    Returns:
        The ``document`` values joined with blank lines in ``id`` order, or
        None when no row matches.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Always release the connection, including when the query raises.
        conn.close()
    if row is None:
        return None
    # An aggregate over zero rows is NULL, i.e. None here.
    return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text):
    """Ask the local Ollama Mistral model for metadata about *text*.

    Args:
        text: Document text; only the first MAX_DOC_CHARS characters are sent.

    Returns:
        The parsed metadata dict with ``char_length`` replaced by the length
        of the truncated text, or ``{"error": ..., "raw": ...}`` when the
        model output is not a JSON object.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            error HTTP status from Ollama.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    if not isinstance(metadata, dict):
        # A JSON list/string/number parses fine but cannot hold the schema;
        # treat it as a failure rather than raising TypeError below.
        return {"error": "JSON response was not an object", "raw": raw[:500]}
    metadata["char_length"] = len(truncated)
    return metadata
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
    """Format metadata as orient-not-bound extraction instructions.

    Returns None when the metadata dict carries an ``"error"`` key.
    """
    if "error" in metadata:
        return None

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
|
||||
|
||||
|
||||
def submit_episode_singular(name, content, custom_instructions):
    """POST one episode to the sidecar's singular /episodes endpoint.

    The cascade orientation travels in ``custom_extraction_instructions``;
    the content is cut to MAX_DOC_CHARS. Returns the decoded JSON reply and
    raises via ``raise_for_status`` on an error HTTP status.
    """
    endpoint = f"{SIDECAR_URL}/episodes"
    request_body = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": "e1_corrected_run",  # neutral label, not the cascade text
        "timestamp": "2026-04-28T00:00:00",
        "group_id": TEST_GROUP_ID,
        "custom_extraction_instructions": custom_instructions,
    }
    reply = requests.post(endpoint, json=request_body, timeout=300)
    reply.raise_for_status()
    return reply.json()
|
||||
|
||||
|
||||
def _save_results(results):
    """Write every record collected so far to RESULTS_FILE as JSON."""
    with open(RESULTS_FILE, "w") as f:
        json.dump({"results": results}, f, indent=2, default=str)


def main():
    """Re-run E1 with the orientation passed as custom_extraction_instructions.

    For each episode in SAMPLE_FILE: fetch its text, generate Mistral
    metadata, turn it into extraction instructions, and submit through the
    singular /episodes endpoint. One record per episode (failures included)
    is collected and the list is rewritten to RESULTS_FILE after each one.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 CORRECTED re-run — {len(selected)} episodes via /episodes (singular)")
    print("Cascade orientation passed in custom_extraction_instructions.\n")

    results = []
    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        print(" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Persist the failure as well: previously this branch skipped the
            # save, so a trailing missing-text record never reached disk.
            _save_results(results)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # None here means metadata generation failed; it is still submitted.
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions
        print(" Submitting via /episodes (singular) with custom_extraction_instructions...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Best-effort: one failed submission is recorded, not fatal.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        results.append(record)
        _save_results(results)
        print()

    print(f"\nDone. Results saved to {RESULTS_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 sample selection — pick 10 episodes from Tier 1 stratified by density and type."""
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
OUTPUT = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
|
||||
# Get all Tier 1 episodes with their entity counts via FalkorDB
|
||||
def query_episode_counts():
    """Return each Episodic node's name and distinct connected-Entity count.

    Runs the Cypher query through ``redis-cli`` inside the ``falkordb``
    container and parses the plain-text reply, where each row arrives as a
    name line followed by a count line.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in the order the
        rows were printed (the query orders by entity count, descending).

    Raises:
        RuntimeError: If the docker/redis-cli command exits non-zero.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        # Without this check a failed command looked like "zero episodes" and
        # only surfaced later as an unrelated IndexError.
        raise RuntimeError(
            f"FalkorDB query failed (exit {result.returncode}): {result.stderr.strip()}"
        )

    # Parse the output — redis-cli returns rows after a header
    lines = [line for line in result.stdout.split("\n") if line.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            i += 2  # skip "name" and "entities" headers
            continue
        # Trailing statistics end the rows. Match the full stat labels so an
        # episode whose name merely starts with "Query"/"Cached" is not
        # mistaken for the end of the data.
        # NOTE(review): labels assumed to be FalkorDB's "Cached execution" /
        # "Query internal execution time" — confirm against real output.
        if lines[i].startswith(("Cached execution", "Query internal execution time")):
            break
        # Each episode: name on one line, count on next
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
            except ValueError:
                # Next line is not a count; resynchronise one line further on.
                i += 1
            else:
                episodes.append({"name": lines[i], "entities": count})
                i += 2
        else:
            i += 1
    return episodes
|
||||
|
||||
print("Fetching episode entity counts from FalkorDB...")
|
||||
episodes = query_episode_counts()
|
||||
print(f"Got {len(episodes)} episodes")
|
||||
|
||||
# Classify by density bucket and type
|
||||
def is_document(name):
    """Return True when *name* ends, case-insensitively, with a document file extension."""
    return name.lower().endswith((".pdf", ".docx", ".pptx", ".txt", ".md"))
|
||||
|
||||
# Compute quartile boundaries from the entity counts
|
||||
if not episodes:
    # Indexing an empty list below would die with a bare IndexError; stop
    # with a message that names the actual problem instead.
    raise SystemExit("No episodes returned from FalkorDB — cannot compute quartile boundaries.")

counts = sorted([e["entities"] for e in episodes], reverse=True)
n = len(counts)
top_q = counts[n // 4]  # 25th percentile from top
bottom_q = counts[3 * n // 4]  # 75th percentile from top

print(f"\nQuartile boundaries: top={top_q}+, middle=({bottom_q+1}-{top_q-1}), bottom=0-{bottom_q}")
|
||||
|
||||
high = [e for e in episodes if e["entities"] >= top_q and not is_document(e["name"])]
|
||||
mid = [e for e in episodes if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
|
||||
low = [e for e in episodes if e["entities"] <= bottom_q and not is_document(e["name"])]
|
||||
docs = [e for e in episodes if is_document(e["name"]) and e["entities"] >= 5]
|
||||
|
||||
print(f"High-density conversations: {len(high)}")
|
||||
print(f"Mid-density conversations: {len(mid)}")
|
||||
print(f"Low-density conversations: {len(low)}")
|
||||
print(f"Documents (≥5 entities): {len(docs)}")
|
||||
|
||||
# Deterministic selection — take from middle of each bucket to avoid edge cases
|
||||
def pick(bucket, n):
    """Take *n* consecutive items centred on the middle of *bucket*.

    A bucket shorter than *n* is returned as-is (the same list object).
    """
    size = len(bucket)
    if size < n:
        return bucket
    first = max(0, size // 2 - n // 2)
    return bucket[first:first + n]
|
||||
|
||||
selected = (
|
||||
pick(high, 3) +
|
||||
pick(mid, 3) +
|
||||
pick(low, 2) +
|
||||
pick(docs, 2)
|
||||
)
|
||||
|
||||
# Tag each with its bucket
|
||||
def bucket_for(ep):
    """Label an episode dict as "document", "high", "mid" or "low".

    Document names win over density; otherwise the entity count is compared
    against the module-level top_q and bottom_q boundaries.
    """
    if is_document(ep["name"]):
        return "document"
    entity_count = ep["entities"]
    if entity_count >= top_q:
        return "high"
    return "mid" if entity_count > bottom_q else "low"
|
||||
|
||||
for ep in selected:
|
||||
ep["bucket"] = bucket_for(ep)
|
||||
|
||||
print(f"\nSelected {len(selected)} episodes for E1:")
|
||||
for ep in selected:
|
||||
print(f" [{ep['bucket']:>8}] {ep['entities']:>3}e {ep['name']}")
|
||||
|
||||
# Save selection
|
||||
with open(OUTPUT, "w") as f:
|
||||
json.dump({
|
||||
"metadata": {
|
||||
"purpose": "E1 cascade re-extraction sample (n=10)",
|
||||
"stratification": "density buckets + document subset",
|
||||
"quartile_top": top_q,
|
||||
"quartile_bottom": bottom_q,
|
||||
"total_tier1_episodes": len(episodes),
|
||||
},
|
||||
"selected": selected,
|
||||
}, f, indent=2)
|
||||
|
||||
print(f"\nSaved to {OUTPUT}")
|
||||
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2 follow-up: confirm Aaron AI alias situation, find other potential duplicates."""
|
||||
import subprocess
|
||||
|
||||
QUERIES = [
|
||||
("Aaron AI variants",
|
||||
"MATCH (n:Entity) WHERE n.name CONTAINS 'Aaron AI' OR n.name CONTAINS 'ARIN' OR n.name CONTAINS 'RNAI' RETURN n.name, n.summary"),
|
||||
("All Mossygear-named entities",
|
||||
"MATCH (n:Entity) WHERE n.name CONTAINS 'Mossy' OR n.name CONTAINS 'A+K' OR n.name CONTAINS 'AK Design' RETURN n.name, n.summary"),
|
||||
("Total entity count check",
|
||||
"MATCH (n:Entity) RETURN count(n) as total"),
|
||||
("Top 30 entity names by edge count",
|
||||
"MATCH (n:Entity)-[r]-() RETURN n.name, count(r) as edges ORDER BY edges DESC LIMIT 30"),
|
||||
]
|
||||
|
||||
# Run every diagnostic query and dump redis-cli's raw output under a banner.
for label, query in QUERIES:
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"QUERY: {label}")
    print(banner)
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2: Entity resolution diagnostic. Queries Graphiti's FalkorDB for the six test entities."""
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
TEST_ENTITIES = ["Aaron", "Kat", "HVAMC", "Bird", "Susan Hamlet", "Tulsa album"]
|
||||
|
||||
def run_cypher(query):
    """Run *query* against the "aaron" graph via redis-cli in the falkordb container.

    Returns the command's captured stdout as text.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
|
||||
|
||||
for name in TEST_ENTITIES:
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"ENTITY: {name}")
|
||||
print('=' * 60)
|
||||
query = f"MATCH (n:Entity) WHERE n.name CONTAINS '{name}' RETURN n.name, n.summary"
|
||||
print(run_cypher(query))
|
||||
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2 follow-up: how many distinct episodes connect to each entity?"""
|
||||
import subprocess
|
||||
|
||||
# Entity names to inspect; each yields one (label, cypher) pair below.
ENTITY_NAMES = [
    "Aaron",
    "Nelson",
    "HVAMC",
    "Bird",
    "Tulsa album",
    "Susan Hamlet",
    "Kat",
    "Katherine Wilson",
]

# Same query for every entity: up to 30 distinct episode names linked to it.
QUERIES = [
    (entity, f"MATCH (n:Entity {{name: '{entity}'}})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30")
    for entity in ENTITY_NAMES
]

for label, query in QUERIES:
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"ENTITY: {label}")
    print(banner)
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
|
||||
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E1.8 Phase 2 — Evaluate
|
||||
Pulls predicate counts from FalkorDB for each group_id and compares.
|
||||
Run after e1_8_taxfree_cascade.py completes.
|
||||
"""
|
||||
|
||||
import json, subprocess
|
||||
from pathlib import Path
|
||||
|
||||
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
|
||||
EVAL_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_eval.json"
|
||||
|
||||
GROUP_TAXFREE = "aaron_e18_taxfree"
|
||||
GROUP_BASELINE = "aaron_e18_baseline"
|
||||
GROUP_STANDARD = "aaron_e18_standard"
|
||||
GROUP_PROD = "aaron"
|
||||
GROUP_E14 = "aaron_cascade_e14"
|
||||
|
||||
|
||||
def query(group_id, cypher):
    """Run *cypher* against graph *group_id* via redis-cli in the falkordb container.

    Returns the command's captured stdout as text.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
|
||||
|
||||
|
||||
def get_episode_uuid(group_id, episode_name):
    """Look up the uuid of the Episodic node named *episode_name* in *group_id*.

    Returns:
        The first output line shaped like a uuid (36 characters containing
        four hyphens), or None when no such line is found.
    """
    # Escape backslashes first, then single quotes, so the name is a valid
    # Cypher string literal. The previous replace("'", "\'") was a no-op
    # because "\'" is just "'" in Python, so apostrophes broke the query.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for line in output.split("\n"):
        line = line.strip()
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
|
||||
|
||||
|
||||
def count_preds(group_id, uuid):
    """Count distinct RELATES_TO edge names whose ``episodes`` list contains *uuid*.

    Returns the first all-digit output line as an int, or 0 when there is none.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    stripped_lines = (raw_line.strip() for raw_line in query(group_id, cypher).split("\n"))
    return next((int(text) for text in stripped_lines if text.isdigit()), 0)
|
||||
|
||||
|
||||
def count_edges(group_id, uuid):
    """Count RELATES_TO edges whose ``episodes`` list contains *uuid*.

    Returns the first all-digit output line as an int, or 0 when there is none.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    stripped_lines = (raw_line.strip() for raw_line in query(group_id, cypher).split("\n"))
    return next((int(text) for text in stripped_lines if text.isdigit()), 0)
|
||||
|
||||
|
||||
def eval_source(name, groups):
    """Collect predicate and edge counts for episode *name* in each group.

    *groups* maps a short label to a graph group_id. The result holds
    ``"name"`` plus ``<label>_preds`` and ``<label>_edges`` per label; both
    are None when the episode's uuid cannot be found in that group.
    """
    result = {"name": name}
    for label, group_id in groups.items():
        uuid = get_episode_uuid(group_id, name)
        preds = edges = None
        if uuid:
            preds = count_preds(group_id, uuid)
            edges = count_edges(group_id, uuid)
        result[f"{label}_preds"] = preds
        result[f"{label}_edges"] = edges
    return result
|
||||
|
||||
|
||||
def run():
    """Compare per-source predicate counts across the E1.8 graph groups.

    Reads the ingest results from RESULTS_PATH, looks up predicate and edge
    counts for every source in sub-samples A and B via eval_source, prints
    per-source and aggregate percentage deltas, evaluates the decision rule
    on sub-sample A, and writes all per-source records to EVAL_PATH.
    """
    print("E1.8 — Evaluation phase")
    print("=" * 60)

    results = json.loads(RESULTS_PATH.read_text())
    eval_results = {"subsample_a": [], "subsample_b": []}

    # Sub-sample A — compare taxfree vs prod (baseline) vs e14 cascade
    print("\nSub-sample A")
    print(f"{'Source':<55} {'prod':>5} {'e14c':>5} {'tf':>5} {'e14Δ':>6} {'tfΔ':>6}")
    print("-" * 90)

    a_records = []
    for item in results["subsample_a"]:
        name = item["name"]
        r = eval_source(name, {
            "prod": GROUP_PROD,
            "e14": GROUP_E14,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["e14_delta_preds"] = item.get("e14_delta_preds")

        # A missing count (None) is shown and compared as 0.
        prod = r.get("prod_preds") or 0
        e14 = r.get("e14_preds") or 0
        tf = r.get("tf_preds") or 0
        e14_delta = ((e14 - prod) / prod * 100) if prod > 0 else 0
        tf_delta = ((tf - prod) / prod * 100) if prod > 0 else 0

        display = name[:53] + ".." if len(name) > 55 else name
        print(f"{display:<55} {prod:>5} {e14:>5} {tf:>5} {e14_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["tf_delta_vs_prod"] = tf_delta
        r["e14_delta_vs_prod"] = e14_delta
        a_records.append(r)
        eval_results["subsample_a"].append(r)

    # Aggregate Sub-sample A
    # Truthiness filter: drops sources whose prod or tf count is None or 0.
    valid = [r for r in a_records if r.get("prod_preds") and r.get("tf_preds")]
    a_mean_e14_delta = a_mean_tf_delta = 0
    if valid:
        a_mean_e14_delta = sum(r["e14_delta_vs_prod"] for r in valid) / len(valid)
        a_mean_tf_delta = sum(r["tf_delta_vs_prod"] for r in valid) / len(valid)
        print(f"\nAggregate Sub-sample A (n={len(valid)}):")
        print(f" E1.4 cascade mean delta vs prod: {a_mean_e14_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs prod: {a_mean_tf_delta:+.1f}%")
        print(f" Taxonomy-free vs E1.4 cascade: {a_mean_tf_delta - a_mean_e14_delta:+.1f}pp")

    # Sub-sample B — all three conditions
    print("\n\nSub-sample B")
    print(f"{'Source':<55} {'base':>5} {'std':>5} {'tf':>5} {'stdΔ':>6} {'tfΔ':>6}")
    print("-" * 90)

    b_records = []
    for item in results["subsample_b"]:
        name = item["name"]
        r = eval_source(name, {
            "base": GROUP_BASELINE,
            "std": GROUP_STANDARD,
            "tf": GROUP_TAXFREE,
        })
        r["bucket"] = item["bucket"]
        r["taxfree_metadata"] = item.get("taxfree_metadata")
        r["standard_metadata"] = item.get("standard_metadata")

        base = r.get("base_preds") or 0
        std = r.get("std_preds") or 0
        tf = r.get("tf_preds") or 0
        std_delta = ((std - base) / base * 100) if base > 0 else 0
        tf_delta = ((tf - base) / base * 100) if base > 0 else 0

        display = name[:53] + ".." if len(name) > 55 else name
        print(f"{display:<55} {base:>5} {std:>5} {tf:>5} {std_delta:>+5.0f}% {tf_delta:>+5.0f}%")

        r["std_delta_vs_base"] = std_delta
        r["tf_delta_vs_base"] = tf_delta
        b_records.append(r)
        eval_results["subsample_b"].append(r)

    # Aggregate Sub-sample B
    valid_b = [r for r in b_records if r.get("base_preds") and r.get("tf_preds")]
    if valid_b:
        # Kept in B-specific names: reusing the sub-sample A variable here
        # used to overwrite it and corrupt the decision rule below.
        b_mean_std_delta = sum(r["std_delta_vs_base"] for r in valid_b) / len(valid_b)
        b_mean_tf_delta = sum(r["tf_delta_vs_base"] for r in valid_b) / len(valid_b)
        print(f"\nAggregate Sub-sample B (n={len(valid_b)}):")
        print(f" Standard cascade mean delta vs baseline: {b_mean_std_delta:+.1f}%")
        print(f" Taxonomy-free mean delta vs baseline: {b_mean_tf_delta:+.1f}%")

        # By bucket
        print("\nPer-bucket (Sub-sample B):")
        for bucket in ["high", "mid", "document"]:
            br = [r for r in valid_b if r["bucket"] == bucket]
            if not br:
                continue
            m_std = sum(r["std_delta_vs_base"] for r in br) / len(br)
            m_tf = sum(r["tf_delta_vs_base"] for r in br) / len(br)
            print(f" [{bucket:>8}] n={len(br)} std={m_std:+.0f}% tf={m_tf:+.0f}%")

    # Decision rule evaluation
    print("\n" + "=" * 60)
    print("DECISION RULE:")
    if valid:
        # Both operands come from sub-sample A (same sources, same prod
        # reference), matching the "Taxonomy-free vs E1.4 cascade" line above.
        improvement = a_mean_tf_delta - a_mean_e14_delta
        if improvement >= 20:
            print(f" ✓ STRONG RECOVERY (+{improvement:.1f}pp) — Stage 3.1 ships as taxonomy-free")
        elif improvement >= 5:
            print(f" ~ PARTIAL RECOVERY (+{improvement:.1f}pp) — orientation helps, needs refinement")
        elif improvement >= 0:
            print(f" ~ MARGINAL (+{improvement:.1f}pp) — consider API extractor prompt redesign (E1.9)")
        else:
            print(f" ✗ NEGATIVE ({improvement:.1f}pp) — taxonomy-free introduces more noise than standard")

    EVAL_PATH.write_text(json.dumps(eval_results, indent=2))
    print(f"\nEval saved to {EVAL_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
@@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E1.8 Phase 1 — Ingest
|
||||
Runs taxonomy-free and standard cascade ingestion for Sub-samples A and B.
|
||||
Run this first, then run e1_8_eval.py to pull predicate counts.
|
||||
"""
|
||||
|
||||
import os, json, time, psycopg2, requests
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
||||
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
RESULTS_PATH = Path.home() / "aaronai" / "experiments" / "e1_8_results.json"
|
||||
|
||||
GROUP_TAXFREE = "aaron_e18_taxfree"
|
||||
GROUP_BASELINE = "aaron_e18_baseline"
|
||||
GROUP_STANDARD = "aaron_e18_standard"
|
||||
|
||||
TAXFREE_PROMPT = """You are a metadata extraction system. Given a document, describe its content shape for use as orientation context in a knowledge graph extraction pass.
|
||||
|
||||
Do not summarize content. Do not extract entities. Do not assign a single category label.
|
||||
|
||||
Instead, describe:
|
||||
- What domains or frames are active in this content (there may be several simultaneously)
|
||||
- How those frames relate to each other in this specific document
|
||||
- What kind of relational content a knowledge graph extractor should look for
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"active_frames": ["<frame 1>", "<frame 2>", ...],
|
||||
"frame_relationships": "<one sentence describing how the frames interact in this document>",
|
||||
"extraction_orientation": "<one sentence orienting the extractor toward the most relationship-rich content>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
STANDARD_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
SUBSAMPLE_A = [
|
||||
{"name": "Claude: Lubbock on everything album lyrics", "bucket": "high"},
|
||||
{"name": "ChatGPT: Tulsa Concept Album Guide", "bucket": "high"},
|
||||
{"name": "ChatGPT: Rhino 3D object flow", "bucket": "high"},
|
||||
{"name": "Claude: SUNY faculty conflict of interest policies", "bucket": "mid"},
|
||||
{"name": "Claude: Interview presentation research and preparation", "bucket": "mid"},
|
||||
{"name": "Claude: Research Statement Restructure", "bucket": "mid"},
|
||||
{"name": "ChatGPT: Respect Individual Interests for Christmas", "bucket": "low"},
|
||||
{"name": "University of North Texas Cover letter.pdf", "bucket": "document"},
|
||||
{"name": "Claude: Finding ideal rural housing near University of Utah", "bucket": "high"},
|
||||
{"name": "ChatGPT: SEC coaches with OSU ties", "bucket": "high"},
|
||||
{"name": "Claude: Bonding ASA 3D printed parts", "bucket": "mid"},
|
||||
{"name": "ChatGPT: Title: User request summary.", "bucket": "low"},
|
||||
{"name": "ChatGPT: Scholarship Recommendation Letter Tips", "bucket": "low"},
|
||||
]
|
||||
|
||||
SUBSAMPLE_B = [
|
||||
{"name": "ChatGPT: Job application comparison", "bucket": "high"},
|
||||
{"name": "ChatGPT: External review for tenure", "bucket": "high"},
|
||||
{"name": "Claude: University of Utah interview teaching example", "bucket": "high"},
|
||||
{"name": "ChatGPT: Starting Dropship Gun Business", "bucket": "high"},
|
||||
{"name": "ChatGPT: Analyze business plan", "bucket": "high"},
|
||||
{"name": "ChatGPT: Outdoor Layering Explained", "bucket": "mid"},
|
||||
{"name": "ChatGPT: Limits in Calculus.", "bucket": "mid"},
|
||||
{"name": "ChatGPT: Academic Program Director Role", "bucket": "mid"},
|
||||
{"name": "ChatGPT: Lonely Island Poop Skit", "bucket": "mid"},
|
||||
{"name": "ChatGPT: Parse Tidal playlist", "bucket": "mid"},
|
||||
{"name": "NO thesis proposal.pdf", "bucket": "document"},
|
||||
{"name": "PWM.pdf", "bucket": "document"},
|
||||
{"name": "Will_It_Print.pdf", "bucket": "document"},
|
||||
{"name": "Kim Kedem Ind Study F2025 Syllabus.docx", "bucket": "document"},
|
||||
{"name": "Aaron Nelson Graduate Transcript.pdf", "bucket": "document"},
|
||||
]
|
||||
|
||||
|
||||
def get_pg():
    """Open and return a new psycopg2 connection using the module-level PG_DSN.

    The caller owns the connection and is responsible for closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def get_document_text(source_name):
    """Rebuild (a prefix of) a document from its stored chunks.

    Reads the ``document`` column of up to 20 ``embeddings`` rows whose
    ``source`` equals ``source_name`` (ordered by ``id``), joins the chunks
    with single spaces and truncates the result to 12000 characters.

    Returns:
        The joined text; an empty string when no rows match.
    """
    pg = get_pg()
    try:
        cur = pg.cursor()
        try:
            cur.execute(
                "SELECT document FROM embeddings WHERE source = %s ORDER BY id LIMIT 20",
                (source_name,),
            )
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cur.close()
    finally:
        # Previously the connection leaked whenever execute()/fetchall() raised.
        pg.close()
    return " ".join(r[0] for r in rows)[:12000]
|
||||
|
||||
|
||||
def run_mistral(prompt_prefix, doc_text, label=""):
    """Send ``prompt_prefix + doc_text`` to the local ``mistral:latest`` model.

    Posts a non-streaming, JSON-format generate request to the endpoint on
    localhost:11434 and parses the model's ``response`` field as JSON.

    Args:
        prompt_prefix: Instruction text placed before the document.
        doc_text: Document text appended to the prompt.
        label: Short tag used only in the progress messages.

    Returns:
        The parsed JSON object as a dict. When the response is not valid JSON,
        or is valid JSON that is not an object, returns
        ``{"error": "parse_failed", "raw": <first 200 chars>}`` so callers can
        always use ``.get()`` on the result.

    Raises:
        Whatever ``requests`` raises for connection failures, plus the HTTP
        error raised by ``raise_for_status()`` on a non-success status.
    """
    print(f" → Mistral {label} running...", flush=True)
    payload = {"model": "mistral:latest", "prompt": prompt_prefix + doc_text, "stream": False, "format": "json"}
    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=300)
    resp.raise_for_status()
    raw = resp.json().get("response", "{}")
    print(f" → Mistral {label} done ({len(raw)} chars)", flush=True)
    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass.
        return {"error": "parse_failed", "raw": raw[:200]}
    if not isinstance(parsed, dict):
        # Valid JSON but not an object (list, string, number): callers index
        # the result by key, so report it as a parse failure instead.
        return {"error": "parse_failed", "raw": raw[:200]}
    return parsed
|
||||
|
||||
|
||||
def build_taxfree_orientation(meta):
    """Format taxonomy-free metadata into a one-line orientation string.

    Args:
        meta: Dict that may contain ``active_frames``, ``frame_relationships``,
            ``extraction_orientation`` and ``one_sentence_summary``. Missing
            keys are rendered as empty text.

    Returns:
        A string of the form ``"Active frames: ... Frame relationships: ...
        Extraction focus: ... Summary: ..."``.
    """
    raw_frames = meta.get("active_frames") or []
    # Model output is not schema-checked: tolerate null, a bare string, or a
    # scalar instead of crashing in join() or splitting a string into letters.
    if not isinstance(raw_frames, (list, tuple)):
        raw_frames = [raw_frames]
    frames = ", ".join(str(frame) for frame in raw_frames)
    rel = meta.get("frame_relationships", "")
    orient = meta.get("extraction_orientation", "")
    summary = meta.get("one_sentence_summary", "")
    return f"Active frames: {frames}. Frame relationships: {rel} Extraction focus: {orient} Summary: {summary}"
|
||||
|
||||
|
||||
def build_standard_orientation(meta):
    """Format standard (taxonomy-based) metadata into a multi-line orientation.

    Args:
        meta: Dict that may contain ``domain_class``, ``primary_format``,
            ``one_sentence_summary`` and a ``content_signals`` dict with
            ``has_named_people`` / ``has_technical_terminology`` flags.

    Returns:
        Five ``key: value`` lines joined by newlines. Missing class/format
        default to ``"unknown"``, a missing summary to empty text, and missing
        signal flags to ``False``.
    """
    dc = meta.get("domain_class", "unknown")
    pf = meta.get("primary_format", "unknown")
    summary = meta.get("one_sentence_summary", "")
    cs = meta.get("content_signals")
    if not isinstance(cs, dict):
        # A null or malformed content_signals value used to crash on cs.get().
        cs = {}
    return (f"domain_class: {dc}\nprimary_format: {pf}\none_sentence_summary: {summary}\n"
            f"has_named_people: {cs.get('has_named_people', False)}\n"
            f"has_technical_terminology: {cs.get('has_technical_terminology', False)}")
|
||||
|
||||
|
||||
def ingest(source_name, doc_text, orientation, group_id):
    """Post one document as a single-episode bulk request to Graphiti.

    Args:
        source_name: Used as the episode name.
        doc_text: Document text; only the first 12000 characters are sent.
        orientation: Text sent as the episode's ``source_description``.
        group_id: Graphiti group the episode is written to.

    Raises:
        The HTTP error from ``raise_for_status()`` on a non-success status,
        plus whatever ``requests`` raises for connection failures.
    """
    episode = {
        "name": source_name,
        "content": doc_text[:12000],
        "source_description": orientation,
        # Every episode in this experiment carries the same fixed timestamp.
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": group_id}
    endpoint = f"{GRAPHITI_URL}/episodes/bulk"
    response = requests.post(endpoint, json=body, timeout=300)
    response.raise_for_status()
|
||||
|
||||
|
||||
def save(results):
    """Serialize ``results`` as indented JSON and overwrite RESULTS_PATH with it."""
    serialized = json.dumps(results, indent=2)
    RESULTS_PATH.write_text(serialized)
|
||||
|
||||
|
||||
def run():
    """Run the E1.8 ingest phase for both sub-samples, resuming if possible.

    Sub-sample A sources are ingested under the taxonomy-free condition only
    and annotated with the matching E1.4 comparison numbers. Sub-sample B
    sources are ingested under baseline, standard and taxonomy-free
    conditions. Results are saved after every completed source, and sources
    already present in RESULTS_PATH are skipped.
    """
    print("E1.8 — Ingest phase")
    print("=" * 60)

    # Pick up where a previous run stopped, if a results file is present.
    if not RESULTS_PATH.exists():
        results = {"subsample_a": [], "subsample_b": []}
        done_a, done_b = set(), set()
    else:
        results = json.loads(RESULTS_PATH.read_text())
        done_a = {row["name"] for row in results.get("subsample_a", [])}
        done_b = {row["name"] for row in results.get("subsample_b", [])}
        print(f"Resuming: {len(done_a)} A done, {len(done_b)} B done")

    e14_path = Path.home() / "aaronai" / "experiments" / "e14_per_source_comparison.json"
    e14_by_name = {row["name"]: row for row in json.loads(e14_path.read_text())}
    # E1.4 fields copied into each sub-sample A row under an "e14_" prefix.
    e14_fields = (
        "prod_preds", "cascade_preds", "delta_preds",
        "prod_edges", "cascade_edges", "delta_edges",
    )

    # Sub-sample A — taxonomy-free only (baseline + standard come from E1.4).
    print("\nSub-sample A — taxonomy-free ingestion only")
    for item in SUBSAMPLE_A:
        name = item["name"]
        if name in done_a:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name}")
        doc_text = get_document_text(name)
        if not doc_text:
            print(" SKIP — no text")
            continue

        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        orientation = build_taxfree_orientation(tf_meta)

        try:
            ingest(name, doc_text, orientation, GROUP_TAXFREE)
            time.sleep(3)
            print(f" ingested to {GROUP_TAXFREE}")
        except Exception as e:
            # A failed source is not recorded, so a later run retries it.
            print(f" ingest failed: {e}")
            continue

        e14 = e14_by_name.get(name, {})
        row = {
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": tf_meta,
            "taxfree_orientation": orientation,
        }
        row.update({f"e14_{field}": e14.get(field) for field in e14_fields})
        results["subsample_a"].append(row)
        save(results)

    # Sub-sample B — all three conditions.
    print("\nSub-sample B — all three conditions")
    for item in SUBSAMPLE_B:
        name = item["name"]
        if name in done_b:
            print(f" SKIP (done): {name}")
            continue
        print(f"\n {name} ({item['bucket']})")
        doc_text = get_document_text(name)
        if not doc_text:
            print(" SKIP — no text")
            continue

        entry = {
            "name": name,
            "bucket": item["bucket"],
            "taxfree_metadata": None,
            "standard_metadata": None,
        }

        # Baseline: no orientation text.
        try:
            ingest(name, doc_text, "", GROUP_BASELINE)
            time.sleep(3)
            print(" baseline ingested")
        except Exception as e:
            print(f" baseline failed: {e}")

        # Standard: orientation built from the taxonomy-based metadata.
        std_meta = run_mistral(STANDARD_PROMPT, doc_text, "standard")
        entry["standard_metadata"] = std_meta
        try:
            ingest(name, doc_text, build_standard_orientation(std_meta), GROUP_STANDARD)
            time.sleep(3)
            print(f" standard ingested, domain_class={std_meta.get('domain_class','?')}")
        except Exception as e:
            print(f" standard failed: {e}")

        # Taxonomy-free: orientation built from the frame metadata.
        tf_meta = run_mistral(TAXFREE_PROMPT, doc_text, "taxfree")
        entry["taxfree_metadata"] = tf_meta
        print(f" frames: {tf_meta.get('active_frames', 'ERROR')}")
        try:
            ingest(name, doc_text, build_taxfree_orientation(tf_meta), GROUP_TAXFREE)
            time.sleep(3)
            print(" taxfree ingested")
        except Exception as e:
            print(f" taxfree failed: {e}")

        # NOTE(review): the entry is recorded even when one or more of the
        # three ingests failed, so a resumed run will skip this source.
        results["subsample_b"].append(entry)
        save(results)

    print("\n" + "=" * 60)
    print(f"Ingest complete. Results at {RESULTS_PATH}")
    print("Now run: python3 ~/aaronai/scripts/experiments/e1_8_eval.py")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E1.9 Phase 1 — Retroactive validation
|
||||
For each E1.8 source, query the production graph with frame_relationships
|
||||
to get a coverage score, then check whether the routing tier prediction
|
||||
matches the actual best-performing condition from E1.8.
|
||||
No API spend required — uses existing E1.8 data and Graphiti search only.
|
||||
"""
|
||||
|
||||
import json, requests
|
||||
from pathlib import Path
|
||||
|
||||
# Graphiti service queried for coverage scoring.
GRAPHITI_URL = "http://localhost:8001"

# All experiment inputs and outputs live under ~/aaronai/experiments.
_EXPERIMENTS_DIR = Path.home().joinpath("aaronai", "experiments")
E18_PATH = _EXPERIMENTS_DIR / "e1_8_eval.json"
E18_INGEST_PATH = _EXPERIMENTS_DIR / "e1_8_results.json"
RESULTS_PATH = _EXPERIMENTS_DIR / "e1_9_retroactive.json"

# Routing thresholds on the coverage score:
#   score >= HIGH_THRESHOLD          -> baseline
#   LOW_THRESHOLD <= score < HIGH    -> standard
#   score < LOW_THRESHOLD            -> taxonomy-free
HIGH_THRESHOLD = 0.70
LOW_THRESHOLD = 0.40
|
||||
|
||||
|
||||
def get_coverage_score(query, group_id="aaron"):
    """Score how well the graph already covers ``query``, from 0.0 to 1.0.

    Asks the Graphiti ``/search`` endpoint for at most three results in
    ``group_id`` and returns ``min(result_count / 3, 1.0)``: no results give
    0.0, one gives 1/3, two give 2/3, three or more give 1.0. Result count is
    used because Graphiti fulltext search returns score=0 for all hits.

    An empty or whitespace-only query scores 0.0 without a request. Any error
    during the search is printed and also scores 0.0.
    """
    if not (query and query.strip()):
        return 0.0
    search_params = {"query": query, "limit": 3, "group_id": group_id}
    try:
        response = requests.get(f"{GRAPHITI_URL}/search", params=search_params, timeout=30)
        response.raise_for_status()
        hit_count = len(response.json().get("results", []))
    except Exception as e:
        print(f" Search error: {e}")
        return 0.0
    return min(hit_count / 3.0, 1.0)
|
||||
|
||||
|
||||
def assign_tier(coverage_score):
    """Map a coverage score to a routing tier name.

    Returns "baseline" at or above HIGH_THRESHOLD, "standard" at or above
    LOW_THRESHOLD, and "taxfree" below that.
    """
    if coverage_score >= HIGH_THRESHOLD:
        return "baseline"
    if coverage_score >= LOW_THRESHOLD:
        return "standard"
    return "taxfree"
|
||||
|
||||
|
||||
def best_condition_from_e18(record, subsample):
    """Name the condition with the highest prediction count in an E1.8 record.

    Sub-sample "a" reads ``prod_preds`` (baseline), ``e14_preds`` (standard)
    and ``tf_preds`` (taxfree); any other sub-sample value reads
    ``base_preds``, ``std_preds`` and ``tf_preds``. Missing or falsy counts
    are treated as 0.

    Returns:
        "taxfree", "standard" or "baseline"; ties go to taxfree first, then
        standard. Returns "unknown" when the highest count is 0.
    """
    if subsample == "a":
        baseline_key, standard_key = "prod_preds", "e14_preds"
    else:
        baseline_key, standard_key = "base_preds", "std_preds"

    baseline = record.get(baseline_key) or 0
    standard = record.get(standard_key) or 0
    taxfree = record.get("tf_preds") or 0

    top = max(baseline, standard, taxfree)
    if top == 0:
        return "unknown"
    if taxfree == top:
        return "taxfree"
    if standard == top:
        return "standard"
    return "baseline"
|
||||
|
||||
|
||||
def _score_subsample(records, subsample, fr_lookup, results):
    """Score one sub-sample, print its table, and append its rows to ``results``.

    Returns:
        ``(correct, total)`` counted over records whose actual best condition
        is known.
    """
    print(f"\nSub-sample {subsample.upper()}")
    print(f"{'Source':<50} {'cov':>5} {'tier':<10} {'predicted':<10} {'actual':<10} {'match'}")
    print("-" * 95)

    correct = 0
    total = 0
    for record in records:
        name = record["name"]
        fr = fr_lookup.get(name, "")
        coverage = get_coverage_score(fr)
        tier = assign_tier(coverage)
        actual_best = best_condition_from_e18(record, subsample)
        matched = tier == actual_best
        # Records with no known best condition are shown but not counted.
        if actual_best != "unknown":
            total += 1
            if matched:
                correct += 1
        display = name[:48] + ".." if len(name) > 50 else name
        print(f"{display:<50} {coverage:>5.2f} {tier:<10} {tier:<10} {actual_best:<10} {'✓' if matched else '✗'}")
        results.append({
            "name": name, "subsample": subsample, "bucket": record.get("bucket"),
            "frame_relationships": fr, "coverage_score": coverage,
            "predicted_tier": tier, "actual_best": actual_best, "match": matched,
        })
    return correct, total


def run():
    """Run E1.9 Phase 1: check routing-tier predictions against E1.8 outcomes.

    For every source in the E1.8 evaluation file, looks up the
    ``frame_relationships`` text recorded during E1.8 ingest, turns it into a
    coverage score and routing tier, and compares that tier with the condition
    that performed best in E1.8. Prints per-source tables, a validation rate,
    the mismatches and the score distribution, then writes everything to
    RESULTS_PATH as JSON.
    """
    print("E1.9 Phase 1 — Retroactive validation")
    print("=" * 60)

    e18_eval = json.loads(E18_PATH.read_text())
    e18_ingest = json.loads(E18_INGEST_PATH.read_text())

    # Build frame_relationships lookup from ingest results.
    fr_lookup = {}
    for key in ("subsample_a", "subsample_b"):
        for item in e18_ingest.get(key, []):
            meta = item.get("taxfree_metadata", {})
            if meta:
                fr_lookup[item["name"]] = meta.get("frame_relationships", "")

    results = []
    correct = 0
    total = 0
    for subsample in ("a", "b"):
        # A missing sub-sample is treated as empty rather than a KeyError,
        # matching how the ingest file is read above.
        records = e18_eval.get(f"subsample_{subsample}", [])
        sub_correct, sub_total = _score_subsample(records, subsample, fr_lookup, results)
        correct += sub_correct
        total += sub_total

    # Summary
    rate = correct / total * 100 if total > 0 else 0
    print(f"\n{'=' * 60}")
    print(f"Validation rate: {correct}/{total} ({rate:.1f}%)")
    print()
    if rate >= 70:
        print("✓ SIGNAL VALIDATED — coverage score predicts best condition")
        print(" Proceed to Phase 2 (new ingestion with routing)")
    elif rate >= 50:
        print("~ MARGINAL — adjust thresholds before Phase 2")
        print(" Review mismatch patterns below")
    else:
        print("✗ SIGNAL NOT PREDICTIVE — frame_relationships coverage")
        print(" may not be the right signal. Consider active_frames fallback.")

    # Mismatch analysis
    mismatches = [r for r in results if not r["match"] and r["actual_best"] != "unknown"]
    if mismatches:
        print(f"\nMismatches ({len(mismatches)}):")
        for r in mismatches:
            # bucket comes from record.get() and may be None, which cannot be
            # formatted with a width specifier.
            bucket_label = r["bucket"] if r["bucket"] is not None else "?"
            print(f" [{bucket_label:<8}] cov={r['coverage_score']:.2f} predicted={r['predicted_tier']} actual={r['actual_best']} | {r['name'][:50]}")

    # Coverage score distribution
    scores = [r["coverage_score"] for r in results]
    print("\nCoverage score distribution:")
    if scores:
        print(f" Mean: {sum(scores)/len(scores):.2f}")
        print(f" Min: {min(scores):.2f}")
        print(f" Max: {max(scores):.2f}")
    else:
        # Nothing was evaluated; mean/min/max are undefined.
        print(" (no sources evaluated)")
    high = sum(1 for s in scores if s >= HIGH_THRESHOLD)
    mid = sum(1 for s in scores if LOW_THRESHOLD <= s < HIGH_THRESHOLD)
    low = sum(1 for s in scores if s < LOW_THRESHOLD)
    print(f" Tier distribution: baseline={high} standard={mid} taxfree={low}")

    # Save
    output = {
        "validation_rate": rate,
        "correct": correct,
        "total": total,
        "thresholds": {"high": HIGH_THRESHOLD, "low": LOW_THRESHOLD},
        "results": results,
    }
    RESULTS_PATH.write_text(json.dumps(output, indent=2))
    print(f"\nSaved to {RESULTS_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Experiment 005 — Actual API Token Measurement
|
||||
|
||||
Measures input token reduction from prepending v2 briefing vs raw document
|
||||
on Claude Haiku, validating the 42.0% modeled estimate from Experiment 002b.
|
||||
|
||||
Outputs: ~/aaronai/experiments/token_measurement_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
# Input briefings and the output of this experiment live under ~/aaronai.
_AARONAI_DIR = Path.home().joinpath("aaronai")
INPUT_FILE = _AARONAI_DIR / "briefing_test_v2_results.json"
OUTPUT_FILE = _AARONAI_DIR / "experiments" / "token_measurement_results.json"

# Model and output-token cap used for every measured call.
MODEL = "claude-haiku-4-5-20251001"
MAX_TOKENS = 1024

# Instruction sent ahead of the document in both the raw and briefed calls.
EXTRACTION_PROMPT = "\n".join([
    "Extract entities and their relationships from the document below. "
    "Return ONLY valid JSON with this schema:",
    "{",
    ' "people": [string],',
    ' "organizations": [string],',
    ' "locations": [string],',
    ' "dates": [string],',
    ' "relationships": [{"subject": string, "predicate": string, "object": string}]',
    "}",
    "No prose, no markdown fences, no commentary. JSON only.",
])
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reconstruct the document by concatenating its chunks from pgvector.

    Args:
        pg_conn: Open database connection; it is left open for the caller.
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        All matching ``document`` chunks, ordered by ``id`` and joined with
        blank lines, or ``None`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query raises; it used to leak.
        cur.close()
    if not rows:
        return None
    return "\n\n".join(r[0] for r in rows)
|
||||
|
||||
|
||||
def build_raw_message(document_text):
    """Return the extraction prompt followed by the document, with no briefing."""
    document_section = f"DOCUMENT:\n{document_text}"
    return "\n\n".join((EXTRACTION_PROMPT, document_section))
|
||||
|
||||
|
||||
def build_briefed_message(briefing, document_text):
    """Return the extraction prompt, the JSON-dumped briefing, then the document.

    The briefing is serialized with ``json.dumps(..., indent=2)`` and placed
    between the prompt and the document; sections are separated by blank lines.
    """
    briefing_section = (
        "BRIEFING (pre-analysis from local model — use to orient):\n"
        + json.dumps(briefing, indent=2)
    )
    document_section = f"DOCUMENT:\n{document_text}"
    return "\n\n".join((EXTRACTION_PROMPT, briefing_section, document_section))
|
||||
|
||||
|
||||
def call_haiku(client, message_text):
    """Send one user message to MODEL and return usage, latency and output.

    Returns:
        Dict with ``input_tokens``, ``output_tokens``, ``latency_s`` (wall
        time of the call, rounded to 2 decimals), ``response_text`` (text of
        the first content block, or "" when there is none) and ``stop_reason``.
    """
    started = time.time()
    reply = client.messages.create(
        model=MODEL,
        max_tokens=MAX_TOKENS,
        messages=[{"role": "user", "content": message_text}],
    )
    usage = reply.usage
    input_tokens = usage.input_tokens
    output_tokens = usage.output_tokens
    elapsed = round(time.time() - started, 2)
    if reply.content:
        text = reply.content[0].text
    else:
        text = ""
    return {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "latency_s": elapsed,
        "response_text": text,
        "stop_reason": reply.stop_reason,
    }
|
||||
|
||||
|
||||
def ci_95(values):
    """Return ``(mean, half_width)`` of a normal-approximation 95% interval.

    The half-width is ``1.96 * stdev / sqrt(n)`` using the sample standard
    deviation. With fewer than two values the half-width is 0.0 and the mean
    is the single value, or 0.0 for an empty input.
    """
    n = len(values)
    if n >= 2:
        center = statistics.mean(values)
        margin = 1.96 * statistics.stdev(values) / (n ** 0.5)
        return (center, margin)
    return (statistics.mean(values) if values else 0.0, 0.0)
|
||||
|
||||
|
||||
def _measure_documents(client, pg_conn, docs_meta):
    """Run the raw and briefed Haiku calls for each document; return the rows.

    Each row is either ``{"source", "skipped"}`` for a document with no text
    in pgvector, or ``{"source", "raw", "briefed", "delta"}`` where ``delta``
    is ``None`` unless both calls reported token usage.
    """
    results = []
    for i, doc in enumerate(docs_meta, 1):
        source = doc["source"]
        briefing = doc["briefing"]

        document_text = fetch_document_text(pg_conn, source)
        if not document_text:
            print(f"[{i:02d}/{len(docs_meta)}] {source[:60]} -- SKIP (not in pgvector)")
            results.append({"source": source, "skipped": "not_in_pgvector"})
            continue

        print(f"[{i:02d}/{len(docs_meta)}] {source[:60]}")

        # A failed call is recorded as an error dict so the run continues.
        try:
            raw_result = call_haiku(client, build_raw_message(document_text))
        except Exception as e:
            print(f" RAW FAILED: {e}")
            raw_result = {"error": str(e)}

        try:
            briefed_result = call_haiku(client, build_briefed_message(briefing, document_text))
        except Exception as e:
            print(f" BRIEFED FAILED: {e}")
            briefed_result = {"error": str(e)}

        delta = None
        if "input_tokens" in raw_result and "input_tokens" in briefed_result:
            raw_in = raw_result["input_tokens"]
            briefed_in = briefed_result["input_tokens"]
            raw_out = raw_result["output_tokens"]
            briefed_out = briefed_result["output_tokens"]
            # Positive input reduction means the briefed call used fewer tokens.
            input_red = (raw_in - briefed_in) / raw_in * 100 if raw_in else 0.0
            output_delta = (briefed_out - raw_out) / raw_out * 100 if raw_out else 0.0
            delta = {
                "input_reduction_pct": round(input_red, 2),
                "output_delta_pct": round(output_delta, 2),
                "raw_input_tokens": raw_in,
                "briefed_input_tokens": briefed_in,
                "raw_output_tokens": raw_out,
                "briefed_output_tokens": briefed_out,
            }
            print(
                f" in: {raw_in} -> {briefed_in} ({input_red:+.1f}%) | "
                f"out: {raw_out} -> {briefed_out}"
            )

        results.append({
            "source": source,
            "raw": raw_result,
            "briefed": briefed_result,
            "delta": delta,
        })
    return results


def main():
    """Measure Haiku token usage with and without a briefing, and save a summary.

    Exits with status 1 when INPUT_FILE is missing or when ANTHROPIC_API_KEY
    or PG_DSN is not set. Otherwise loads the successful briefings from
    INPUT_FILE, calls the model twice per document (raw and briefed),
    aggregates token counts, cost and the mean input-token reduction with a
    95% interval, writes the summary to OUTPUT_FILE and prints the headline
    numbers.
    """
    if not INPUT_FILE.exists():
        print(f"ERROR: {INPUT_FILE} not found", file=sys.stderr)
        sys.exit(1)

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)
    try:
        with open(INPUT_FILE) as f:
            v2_data = json.load(f)

        docs_meta = [
            d for d in v2_data["documents"]
            if d.get("status") == "SUCCESS"
            and d.get("briefing")
        ]

        print(f"Loaded {len(docs_meta)} successful briefings from {INPUT_FILE.name}")
        print(f"Model: {MODEL}")
        print(f"Calls planned: up to {len(docs_meta) * 2}\n")

        started_at = datetime.now(timezone.utc).isoformat()
        t_total = time.time()
        results = _measure_documents(client, pg_conn, docs_meta)
    finally:
        # Close the connection on every exit path; it used to leak whenever
        # loading the input or processing a document raised.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    valid = [r for r in results if r.get("delta") is not None]
    skipped = [r for r in results if r.get("skipped")]
    reductions = [r["delta"]["input_reduction_pct"] for r in valid]
    output_deltas = [r["delta"]["output_delta_pct"] for r in valid]
    raw_in_total = sum(r["delta"]["raw_input_tokens"] for r in valid)
    briefed_in_total = sum(r["delta"]["briefed_input_tokens"] for r in valid)
    raw_out_total = sum(r["delta"]["raw_output_tokens"] for r in valid)
    briefed_out_total = sum(r["delta"]["briefed_output_tokens"] for r in valid)

    # Cost weights applied per token and divided by one million below.
    # NOTE(review): presumably USD per million tokens for this model — confirm.
    HAIKU_IN = 1.0
    HAIKU_OUT = 5.0
    raw_cost = (raw_in_total * HAIKU_IN + raw_out_total * HAIKU_OUT) / 1_000_000
    briefed_cost = (briefed_in_total * HAIKU_IN + briefed_out_total * HAIKU_OUT) / 1_000_000

    mean_red, ci_half = ci_95(reductions)
    mean_out_delta, _ = ci_95(output_deltas)

    summary = {
        "experiment": "005",
        "title": "Actual API Token Measurement",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "extraction_prompt": EXTRACTION_PROMPT,
        "n_documents_attempted": len(docs_meta),
        "n_skipped_not_in_pgvector": len(skipped),
        "n_valid_pairs": len(valid),
        "n_failed": len(docs_meta) - len(valid) - len(skipped),
        "total_elapsed_s": total_elapsed,
        "input_token_reduction": {
            "mean_pct": round(mean_red, 2),
            "ci_95_half_width_pct": round(ci_half, 2),
            "median_pct": round(statistics.median(reductions), 2) if reductions else None,
            "min_pct": round(min(reductions), 2) if reductions else None,
            "max_pct": round(max(reductions), 2) if reductions else None,
            "stdev_pct": round(statistics.stdev(reductions), 2) if len(reductions) > 1 else 0.0,
        },
        "output_token_delta": {"mean_pct": round(mean_out_delta, 2)},
        "totals": {
            "raw_input_tokens": raw_in_total,
            "briefed_input_tokens": briefed_in_total,
            "raw_output_tokens": raw_out_total,
            "briefed_output_tokens": briefed_out_total,
            "raw_cost_usd": round(raw_cost, 4),
            "briefed_cost_usd": round(briefed_cost, 4),
            "savings_usd": round(raw_cost - briefed_cost, 4),
        },
        "comparison_to_v2_estimate": {
            "v2_modeled_reduction_pct": 42.0,
            "measured_mean_reduction_pct": round(mean_red, 2),
            "delta_pct_points": round(mean_red - 42.0, 2),
        },
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(docs_meta)} valid pairs in {total_elapsed}s")
    if skipped:
        print(f"Skipped (not in pgvector): {len(skipped)}")
    print(f"Mean input token reduction: {mean_red:.2f}% +/- {ci_half:.2f}% (95% CI)")
    print(f"V2 modeled estimate: 42.0% | delta: {mean_red - 42.0:+.2f} pts")
    print(f"Mean output token delta: {mean_out_delta:+.2f}%")
    print(f"Total cost: ${raw_cost + briefed_cost:.4f}")
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+43
-8
@@ -299,22 +299,57 @@ class IngestHandler(FileSystemEventHandler):
|
||||
self.pending = False
|
||||
self.last_event = 0
|
||||
|
||||
def on_any_event(self, event):
|
||||
def _should_ignore(self, path: Path) -> bool:
|
||||
if path.name.startswith((".", "~$")):
|
||||
return True
|
||||
if "Admin/Backups" in str(path) or "Backups" in path.parts:
|
||||
return True
|
||||
if "Journal/Media" in str(path):
|
||||
return True
|
||||
return False
|
||||
|
||||
def on_created(self, event):
|
||||
if event.is_directory:
|
||||
return
|
||||
path = Path(event.src_path)
|
||||
if path.suffix.lower() not in SUPPORTED:
|
||||
if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
|
||||
return
|
||||
if path.name.startswith((".", "~$")):
|
||||
log.info(f"Event: created {path}")
|
||||
self.pending = True
|
||||
self.last_event = time.time()
|
||||
|
||||
def on_modified(self, event):
|
||||
if event.is_directory:
|
||||
return
|
||||
if "Admin/Backups" in str(path) or "Backups" in path.parts:
|
||||
path = Path(event.src_path)
|
||||
if path.suffix.lower() not in SUPPORTED or self._should_ignore(path):
|
||||
return
|
||||
if "Journal/Media" in str(path):
|
||||
log.info(f"Event: modified {path}")
|
||||
self.pending = True
|
||||
self.last_event = time.time()
|
||||
|
||||
def on_moved(self, event):
|
||||
if event.is_directory:
|
||||
return
|
||||
if event.event_type not in ("modified", "created", "moved"):
|
||||
# Nextcloud WebDAV writes .part temp files then renames to final path.
|
||||
# src_path is the .part file; dest_path is the final filename.
|
||||
dest = Path(event.dest_path)
|
||||
if dest.suffix.lower() not in SUPPORTED or self._should_ignore(dest):
|
||||
return
|
||||
log.info(f"Event: {event.event_type} {event.src_path}")
|
||||
self.pending = True
|
||||
log.info(f"Event: moved -> {dest}")
|
||||
self.pending = True
|
||||
self.last_event = time.time()
|
||||
|
||||
def on_closed(self, event):
    """Mark an ingest as pending when a supported, non-ignored file is closed.

    FileClosedEvent fires on the final file after Nextcloud completes its
    write, so this is a belt-and-suspenders catch for any write pattern that
    on_moved does not see.
    """
    if event.is_directory:
        return
    closed_path = Path(event.src_path)
    is_supported = closed_path.suffix.lower() in SUPPORTED
    if not is_supported or self._should_ignore(closed_path):
        return
    log.info(f"Event: closed {closed_path}")
    # Flag the work and record when the event arrived.
    self.pending = True
    self.last_event = time.time()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user