scripts/: separate production from experimental and deprecated
Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2, base_class, cascade, cost_test, briefing, consistency, token series). Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py, tier1_migration.py — under the bespoke decision both target retired substrate work). Removes 19 .bak* files from disk (gitignored, never tracked; git history is the durable record of every prior version). The 11 production scripts remain in scripts/. All systemd ExecStart paths, api.py subprocess calls, and cron jobs continue to resolve correctly — verified by grep against /etc/systemd/system/aaronai-*.service, scripts/ references in api.py, and the user crontab. Track 1 inventory cross-cutting finding: scripts/ mixed 11 production files with 32 experimental scripts and ~20 .bak files. After this commit a clean-room reader can identify the live workers from a directory listing alone. Found by Track 1 inventory 2026-05-02. See ~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning. After commit, run: 1. git log --oneline -3 — show the new commit on top 2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Audit Expansion Pack Generator — type-aware stratified draw of 12
|
||||
documents from base_class_validation_results.json for n=20 audit expansion.
|
||||
|
||||
Per audit-expansion-protocol.md amendment 2026-04-28:
|
||||
The seed=43 length-only random draw concentrated on course modules in the
|
||||
small and medium buckets, missing voice captures, syllabi, and
|
||||
conversational documents present in the candidate distribution.
|
||||
This script implements type-aware stratification within each length
|
||||
bucket to produce a sample representative of BirdAI's document-type mix.
|
||||
|
||||
Targets (12 total):
|
||||
small (4): 2 course_module + 2 voice_capture
|
||||
medium (4): 2 course_module + 1 syllabus + 1 other
|
||||
large (4): 1 course_ppt + 1 syllabus + 1 faculty_report + 1 conversational
|
||||
|
||||
Output: ~/aaronai/experiments/audit_expansion_pack.json
|
||||
|
||||
Usage:
|
||||
python3 ~/aaronai/scripts/audit_expansion_draw.py
|
||||
python3 ~/aaronai/scripts/audit_expansion_draw.py --dry-run
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# All inputs and outputs of this script live in one experiments directory
# under the user's home folder.
EXPERIMENTS = Path.home().joinpath("aaronai", "experiments")
VALIDATION_RESULTS = EXPERIMENTS.joinpath("base_class_validation_results.json")
EXISTING_AUDIT_PACK = EXPERIMENTS.joinpath("base_class_audit_pack.json")
OUTPUT_FILE = EXPERIMENTS.joinpath("audit_expansion_pack.json")

# Passed to random.seed() before the draw so that reruns select the same
# documents from the same candidate list.
SEED = 43

# Number of documents to draw per document type inside each length bucket.
# Key order matters: buckets and types are drawn in this order from a single
# seeded random stream.
TYPE_TARGETS = {
    "small": {
        "course_module": 2,
        "voice_capture": 2,
    },
    "medium": {
        "course_module": 2,
        "syllabus": 1,
        "other": 1,
    },
    "large": {
        "course_ppt": 1,
        "syllabus": 1,
        "faculty_report": 1,
        "conversational": 1,
    },
}
|
||||
|
||||
|
||||
def classify(source, bucket):
    """Map a source filename to a document type.

    The checks run in a fixed order and the first match wins, so a name that
    fits several patterns gets the earliest type listed here:
    voice_capture, conversational, syllabus, faculty_report, course_ppt,
    course_module, other.

    Args:
        source: The document's source name. Anything that is not a string
            (for example ``None`` from a record without a "source" key) is
            classified as "other" instead of raising.
        bucket: The document's length bucket. Only consulted for the
            course_ppt check, which applies to the "large" bucket alone.

    Returns:
        One of "voice_capture", "conversational", "syllabus",
        "faculty_report", "course_ppt", "course_module" or "other".
    """
    # A record without a usable name cannot match any filename pattern.
    if not isinstance(source, str):
        return "other"

    s = source.lower()

    # Voice captures — pattern: YYYY-MM-DD-HH-MM-voice.md
    # (matched against the original, case-sensitive name).
    if re.match(r"\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-voice\.md$", source):
        return "voice_capture"

    # Conversational exports — pattern: "Claude: ..." or "ChatGPT: ..."
    if source.startswith(("Claude:", "ChatGPT:")):
        return "conversational"

    # Syllabus — must contain "syllabus" in the name
    if "syllabus" in s:
        return "syllabus"

    # Faculty / annual reports
    if "faculty report" in s or "annual report" in s:
        return "faculty_report"

    # Course PPTs (large bucket only) — ".pptx" or "_ppt_" anywhere in the
    # name, or a leading "modN_" prefix.
    if bucket == "large" and (".pptx" in s or "_ppt_" in s or re.match(r"mod\d+_", s)):
        return "course_ppt"

    # Course modules — two-digit numeric prefix followed by an underscore.
    # re.match already anchors at the start of the string.
    if re.match(r"\d{2}_", source):
        return "course_module"

    # Everything else falls into 'other' (only a draw target in the medium bucket).
    return "other"
|
||||
|
||||
|
||||
def main():
    """Run the type-aware stratified draw and write the audit expansion pack.

    Loads VALIDATION_RESULTS, drops documents whose source is already in
    EXISTING_AUDIT_PACK, documents missing either condition's output and
    documents whose size bucket is not in TYPE_TARGETS, tags every remaining
    document with a ``_type`` via classify(), and then draws up to
    TYPE_TARGETS[bucket][type] documents for each (bucket, type) pair using
    a random stream seeded with SEED. Shortfalls are reported on stderr and
    recorded in the output metadata.

    With ``--dry-run`` the draw is only printed; otherwise it is written to
    OUTPUT_FILE. Exits with status 1 if VALIDATION_RESULTS does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not VALIDATION_RESULTS.exists():
        print(f"ERROR: {VALIDATION_RESULTS} not found", file=sys.stderr)
        sys.exit(1)
    with open(VALIDATION_RESULTS, encoding="utf-8") as f:
        validation = json.load(f)

    all_docs = validation["results"]
    print(f"Loaded {len(all_docs)} documents from validation results")
    print(f"Experiment: {validation.get('title', 'unknown')}")

    # Load existing audit pack to exclude its sources (audit pack uses 'pairs')
    excluded_sources = set()
    if EXISTING_AUDIT_PACK.exists():
        with open(EXISTING_AUDIT_PACK, encoding="utf-8") as f:
            existing = json.load(f)
        # The pack may be an object holding the entries under "pairs" or
        # "results", or a bare list of entries. Only call .get() on an object.
        if isinstance(existing, dict):
            existing_pairs = existing.get("pairs", existing.get("results", existing))
        else:
            existing_pairs = existing
        for doc in existing_pairs:
            # Skip anything that is not an entry object (e.g. the keys of an
            # object that has neither "pairs" nor "results").
            if not isinstance(doc, dict):
                continue
            src = doc.get("source")
            if src:
                excluded_sources.add(src)
        print(f"Excluding {len(excluded_sources)} sources already in audit pack")

    # Filter to valid candidates
    valid_docs = []
    for doc in all_docs:
        src = doc.get("source")
        if src in excluded_sources:
            continue
        if not doc.get("condition_a") or not doc.get("condition_b"):
            continue
        bucket = doc.get("size_bucket")
        if bucket not in TYPE_TARGETS:
            continue
        # The type tag is stored on the record itself and is therefore part
        # of what gets written to the output file.
        doc["_type"] = classify(src, bucket)
        valid_docs.append(doc)

    print(f"Valid candidate documents: {len(valid_docs)}")

    # Print what's available per (bucket, type) before drawing
    print("\nCandidates by (bucket, type):")
    for bucket in TYPE_TARGETS:
        available = {}
        for d in valid_docs:
            if d["size_bucket"] == bucket:
                available[d["_type"]] = available.get(d["_type"], 0) + 1
        print(f" {bucket}:")
        for t in sorted(available):
            target = TYPE_TARGETS[bucket].get(t, "—")
            print(f" {t:>16}: {available[t]} avail, target {target}")

    # Stratified type-aware draw
    random.seed(SEED)
    drawn = []
    warnings = []
    for bucket, type_targets in TYPE_TARGETS.items():
        bucket_docs = [d for d in valid_docs if d["size_bucket"] == bucket]
        for doc_type, target in type_targets.items():
            type_docs = [d for d in bucket_docs if d["_type"] == doc_type]
            if len(type_docs) < target:
                msg = (f"WARNING: bucket={bucket} type={doc_type} "
                       f"available={len(type_docs)} target={target}")
                warnings.append(msg)
                print(msg, file=sys.stderr)
            # Draw what is available rather than failing on a shortfall.
            n_to_draw = min(target, len(type_docs))
            sample = random.sample(type_docs, n_to_draw)
            drawn.extend(sample)

    # Report draw
    print(f"\nDrew {len(drawn)} documents:")
    for d in drawn:
        src = d.get("source", "<unknown>")
        chars = d.get("doc_chars_original", 0)
        bucket = d.get("size_bucket", "?")
        doc_type = d.get("_type", "?")
        truncated = " (TRUNCATED)" if d.get("truncated") else ""
        print(f" [{bucket:>6}/{doc_type:>16}] {chars:>6}c {src}{truncated}")

    # Bucket-level summary; every drawn document's bucket is a TYPE_TARGETS key.
    bucket_counts = {bucket: 0 for bucket in TYPE_TARGETS}
    for d in drawn:
        bucket_counts[d["size_bucket"]] += 1
    print(f"\nBucket totals: {bucket_counts}")

    if args.dry_run:
        print("\n--dry-run set, not writing output file")
        return

    output = {
        "metadata": {
            "generated_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
            "source_validation_file": str(VALIDATION_RESULTS),
            "seed": SEED,
            "stratification": "type-aware within length bucket",
            "type_targets": TYPE_TARGETS,
            "bucket_counts": bucket_counts,
            "excluded_count": len(excluded_sources),
            "warnings": warnings,
            "purpose": "n=20 audit expansion per audit-expansion-protocol.md (type-aware amendment)",
        },
        "results": drawn,
    }
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, default=str)
    print(f"\nWrote {OUTPUT_FILE}")
    print(f" {len(drawn)} documents ready for rating")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,605 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base-Class Enrichment Test — OOP Framing Experiment
|
||||
|
||||
Tests whether non-entity metadata from a local model (domain class, structural
|
||||
signals, presence flags, length, summary) can take load off the API without
|
||||
constraining what it extracts.
|
||||
|
||||
The local model does NOT draft entities. The API still does full extraction.
|
||||
The local model produces metadata that orients the API's reading.
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction, no metadata
|
||||
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
|
||||
|
||||
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
|
||||
If B produces fewer edges or less predicate diversity, metadata is acting as
|
||||
constraint and the OOP framing is falsified.
|
||||
|
||||
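Sample (audit re-run): every source listed under "pairs" in
~/aaronai/experiments/base_class_audit_pack.json. briefing_test_v2_results.json
must still exist and is loaded, but its documents are not sampled.
Each document is bucketed by the length of the text actually sent:
- small (<1000 chars)
- medium (1000-4999 chars)
- large (5000+ chars, text capped at 12K)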
|
||||
|
||||
Outputs: ~/aaronai/experiments/base_class_audit_rerun_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
# Input listing from the earlier briefing test, and where this run's summary is written.
V2_FILE = Path.home().joinpath("aaronai", "briefing_test_v2_results.json")
OUTPUT_FILE = Path.home().joinpath(
    "aaronai", "experiments", "base_class_audit_rerun_results.json"
)

# Anthropic API call settings (used for both conditions).
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0

# Local Ollama endpoint that produces the condition-B metadata.
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180  # passed as the HTTP request timeout

# Documents are cut to this many characters before being sent to either model.
MAX_DOC_CHARS = 12000

# API price in USD per million tokens, used for the cost totals in the summary.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"language": "en or other",
|
||||
"char_length": integer,
|
||||
"primary_format": "prose, presentation, list, form, code, or mixed",
|
||||
"structural_signals": {
|
||||
"has_headings": boolean,
|
||||
"has_bullet_lists": boolean,
|
||||
"has_numbered_lists": boolean,
|
||||
"has_tables": boolean,
|
||||
"has_code_blocks": boolean,
|
||||
"has_dates": boolean
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": boolean,
|
||||
"has_institutional_language": boolean,
|
||||
"has_technical_terminology": boolean,
|
||||
"has_first_person": boolean,
|
||||
"has_quotations": boolean
|
||||
},
|
||||
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
|
||||
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
|
||||
}
|
||||
|
||||
JSON only, no commentary.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
|
||||
|
||||
DOCUMENT METADATA:
|
||||
{metadata_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return ``text`` without a surrounding Markdown code fence.

    Leading/trailing whitespace is trimmed, then an opening fence at the very
    start (three backticks, optionally followed by ``json``) and a closing
    fence at the very end are removed. Falsy input (``None`` or ``""``)
    yields an empty string.
    """
    if not text:
        return ""
    without_opening = re.sub(r"^```(?:json)?\s*", "", text.strip())
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its stored chunks.

    Selects every ``document`` value in the ``embeddings`` table whose
    ``source`` equals the given name, ordered by ``id``, and joins them with
    blank lines.

    Args:
        pg_conn: An open database connection providing ``cursor()``.
        source: The source name to look up (passed as a query parameter).

    Returns:
        ``(text, full_length)`` where ``text`` is the joined document cut to
        MAX_DOC_CHARS characters and ``full_length`` is its length before the
        cut, or ``(None, 0)`` when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and summarise the reply.

    Args:
        client: Object exposing ``messages.create(...)``.
        prompt_text: Full prompt, sent as the single user message.

    Returns:
        Dict with ``input_tokens``, ``output_tokens``, ``latency_s``
        (wall-clock seconds, rounded to 2 places), ``response_text`` (text of
        the first content block, or "" when the reply has no content) and
        ``stop_reason``. Exceptions from the client propagate to the caller.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    if reply.content:
        text = reply.content[0].text
    else:
        text = ""
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
        "response_text": text,
        "stop_reason": reply.stop_reason,
    }
    return summary
|
||||
|
||||
|
||||
def call_local_metadata(document_text):
    """Ask the local model for surface metadata about a document.

    Posts LOCAL_METADATA_PROMPT followed by the document to OLLAMA_URL with
    JSON output requested and streaming disabled.

    Returns:
        ``{"response": <raw model text>, "latency_s": ...}`` on success, or
        ``{"error": <message>, "latency_s": ...}`` if anything in the request
        raises. This function never raises; callers check for "error".
    """
    started = time.time()

    def elapsed():
        # Seconds since the call began, rounded the same way in both outcomes.
        return round(time.time() - started, 2)

    try:
        payload = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_METADATA_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=payload, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        body = reply.json()
        return {
            "response": body.get("response", ""),
            "latency_s": elapsed(),
        }
    except Exception as exc:
        # Best-effort by design: a failed local pass is reported, not raised.
        return {"error": str(exc), "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph_full(raw):
    """Parse a model reply into ``(entities, edges, parsed_ok)``.

    The reply is accepted only if, after code fences are stripped, it is a
    JSON object whose "entities" and "edges" values are both lists; those two
    lists are then returned with ``True``. Every other outcome (empty text,
    invalid JSON, wrong shape) returns ``(None, None, False)``.
    """
    failure = (None, None, False)

    text = strip_json_fences(raw)
    if not text:
        return failure

    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        return failure

    if not isinstance(payload, dict):
        return failure

    entities = payload.get("entities")
    relations = payload.get("edges")
    if not (isinstance(entities, list) and isinstance(relations, list)):
        return failure
    return entities, relations, True
|
||||
|
||||
|
||||
def parse_metadata(raw):
    """Decode the local model's reply as JSON.

    Returns whatever value the JSON decodes to, or ``None`` when the reply is
    empty after fence stripping or is not valid JSON.
    """
    text = strip_json_fences(raw)
    if not text:
        return None
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        decoded = None
    return decoded
|
||||
|
||||
|
||||
def graph_metrics(entities, edges):
    """Summarise an extracted graph as a dict of simple quality metrics.

    Args:
        entities: List of entity items, or ``None``.
        edges: List of edge items, or ``None``.

    Returns:
        ``None`` if either argument is ``None``; otherwise a dict with
        ``n_entities``, ``n_edges``, ``predicate_diversity``,
        ``type_diversity``, ``avg_degree``, ``largest_component`` and
        ``largest_component_pct``.

    Counts use the raw list lengths. Names, types and predicates are compared
    after ``str()``, strip and lower-casing, and items that are not dicts are
    ignored for everything except the two raw counts.
    """
    if entities is None or edges is None:
        return None

    def norm(value):
        return str(value).strip().lower()

    entity_dicts = [item for item in entities if isinstance(item, dict)]
    edge_dicts = [item for item in edges if isinstance(item, dict)]

    node_count = len(entities)
    edge_count = len(edges)

    # Distinct normalised predicates and entity types (missing/empty skipped).
    distinct_predicates = {
        norm(edge.get("predicate")) for edge in edge_dicts if edge.get("predicate")
    }
    distinct_types = {
        norm(ent.get("type")) for ent in entity_dicts if ent.get("type")
    }

    # Each edge touches two nodes, hence the factor of two.
    if node_count > 0:
        mean_degree = 2 * edge_count / node_count
    else:
        mean_degree = 0.0

    # Undirected adjacency over declared entity names; an edge is kept only
    # when both of its endpoints are declared entities.
    neighbours = {
        norm(ent.get("name")): set() for ent in entity_dicts if ent.get("name")
    }
    for edge in edge_dicts:
        subj = norm(edge.get("subject", ""))
        obj = norm(edge.get("object", ""))
        if subj in neighbours and obj in neighbours:
            neighbours[subj].add(obj)
            neighbours[obj].add(subj)

    # Walk each unvisited node's component and remember the biggest size.
    seen = set()
    biggest = 0
    for root in neighbours:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for nxt in neighbours[current]:
                if nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)
        biggest = max(biggest, size)

    if node_count:
        biggest_pct = round(100 * biggest / node_count, 1)
    else:
        biggest_pct = 0.0

    return {
        "n_entities": node_count,
        "n_edges": edge_count,
        "predicate_diversity": len(distinct_predicates),
        "type_diversity": len(distinct_types),
        "avg_degree": round(mean_degree, 2),
        "largest_component": biggest,
        "largest_component_pct": biggest_pct,
    }
|
||||
|
||||
|
||||
def stratify(docs):
    """Return the audit re-run sample built from base_class_audit_pack.json.

    One synthetic doc-meta entry is produced for every item under "pairs" in
    ~/aaronai/experiments/base_class_audit_pack.json, in file order.

    Args:
        docs: Accepted for call compatibility only; the sample is taken from
            the audit pack and this argument is not consulted.

    Returns:
        A list of ``{"source": ..., "content_length": 0, "status": "SUCCESS"}``
        dicts, or an empty list (after reporting on stderr) when the audit
        pack file does not exist.
    """
    audit_file = Path.home() / "aaronai" / "experiments" / "base_class_audit_pack.json"
    if not audit_file.exists():
        # Errors go to stderr, like every other error message in this script.
        print(f"ERROR: {audit_file} not found", file=sys.stderr)
        return []
    audit = json.loads(audit_file.read_text(encoding="utf-8"))
    audit_sources = [p["source"] for p in audit["pairs"]]

    # Synthesize doc_meta entries for the audit sources; content_length is a
    # placeholder because the real length is measured when the text is fetched.
    sample = [{"source": s, "content_length": 0, "status": "SUCCESS"}
              for s in audit_sources]
    print(f"Audit re-run: {len(sample)} docs from base_class_audit_pack.json")
    return sample
|
||||
|
||||
|
||||
def fmt_metrics(m):
    """Render a graph_metrics() dict as a one-line summary.

    Returns "n/a" when ``m`` is ``None``; otherwise a space-separated string
    of entity count, edge count, predicate diversity, type diversity, average
    degree and largest component size over entity count.
    """
    if m is None:
        return "n/a"
    fields = [
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    ]
    return " ".join(fields)
|
||||
|
||||
|
||||
def _condition_a_record(a, a_metrics):
    """Build the stored ``condition_a`` entry from a call_haiku result or error dict."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def main():
    """Run conditions A and B over the sample and write the comparison summary.

    For every sampled source the document text is fetched from the database,
    then:
      A — the text is sent to Haiku with the baseline prompt;
      B — the local model produces metadata, which is embedded in the
          condition-B prompt for a second Haiku call.
    Per-document results, per-bucket averages and API cost totals are written
    to OUTPUT_FILE and a summary is printed.

    A failure in one condition is recorded on that document's result and the
    run continues. Exits with status 1 if ANTHROPIC_API_KEY or PG_DSN is not
    set or if V2_FILE does not exist.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE, encoding="utf-8") as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    # NOTE(review): the "15s/25m/10l, file order" wording below predates the
    # audit re-run sampling done by stratify(); kept verbatim so existing run
    # logs stay comparable — confirm whether it should be updated.
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            sent_len = len(doc_text)
            truncated = original_len > sent_len
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

            # Fields shared by every result record for this document.
            doc_info = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
            }

            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
                a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
                print(f" {fmt_metrics(a_metrics)}", flush=True)
            except Exception as e:
                # Record the failure and still attempt condition B.
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_metrics = None

            # Condition B local metadata pass
            local_result = call_local_metadata(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']}", flush=True)
                results.append({
                    **doc_info,
                    "condition_a": _condition_a_record(a, a_metrics),
                    "condition_b": {
                        "skipped": "local_model_failed",
                        "local_error": local_result["error"],
                        "local_latency_s": local_result.get("latency_s"),
                    },
                })
                continue

            local_raw = local_result["response"]
            metadata = parse_metadata(local_raw)
            # Override LLM-hallucinated char_length with Python-computed truth
            if metadata is not None and isinstance(metadata, dict):
                metadata["char_length"] = len(doc_text)
            print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
                  flush=True)

            if metadata is None:
                print(" B: metadata parse failed — skipping API call", flush=True)
                results.append({
                    **doc_info,
                    "condition_a": _condition_a_record(a, a_metrics),
                    "condition_b": {
                        "skipped": "metadata_parse_failed",
                        "local_latency_s": local_result.get("latency_s"),
                        "local_raw": local_raw[:1000],
                    },
                })
                continue

            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
            # str.replace rather than str.format: the prompt contains literal
            # JSON braces that format() would treat as fields.
            b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text

            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
                b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
                print(f" {fmt_metrics(b_metrics)}", flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_metrics = None

            # Per-doc deltas (only when both API calls returned usage figures)
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                pred_pct_str = "n/a"
                if a_metrics and b_metrics:
                    if a_metrics["n_edges"] > 0:
                        edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                    if a_metrics["predicate_diversity"] > 0:
                        pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
                print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                      flush=True)

            results.append({
                **doc_info,
                "condition_a": _condition_a_record(a, a_metrics),
                "condition_b": {
                    "local_latency_s": local_result.get("latency_s"),
                    "local_metadata": metadata,
                    "local_raw": local_raw[:1000],
                    "api_input_tokens": b.get("input_tokens"),
                    "api_output_tokens": b.get("output_tokens"),
                    "api_latency_s": b.get("latency_s"),
                    "metrics": b_metrics,
                    "stop_reason": b.get("stop_reason"),
                    "response_text": b.get("response_text", "")[:32000],
                    "error": b.get("error"),
                },
            })
    finally:
        # Close the connection even if the loop is interrupted or raises.
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both conditions produced a parsable graph are
    # included in the totals and averages.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            print(f" cost: in {stats['input_delta_pct']:+.1f}% out {stats['output_delta_pct']:+.1f}%")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,593 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base-Class Enrichment Test — OOP Framing Experiment
|
||||
|
||||
Tests whether non-entity metadata from a local model (domain class, structural
|
||||
signals, presence flags, length, summary) can take load off the API without
|
||||
constraining what it extracts.
|
||||
|
||||
The local model does NOT draft entities. The API still does full extraction.
|
||||
The local model produces metadata that orients the API's reading.
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction, no metadata
|
||||
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
|
||||
|
||||
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
|
||||
If B produces fewer edges or less predicate diversity, metadata is acting as
|
||||
constraint and the OOP framing is falsified.
|
||||
|
||||
Sample: 20 docs from briefing_test_v2_results.json:
|
||||
- 5 small (<1000 chars)
|
||||
- 10 medium (1000-5000 chars)
|
||||
- 5 large (5000-12000 chars, capped at 12K)
|
||||
|
||||
Outputs: ~/aaronai/experiments/base_class_test_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from the project's .env file; main() later reads
# ANTHROPIC_API_KEY and PG_DSN from the environment.
load_dotenv(Path.home() / "aaronai" / ".env")

# Earlier experiment output; main() draws the document sample from its "documents" list.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Destination of the JSON summary written at the end of main().
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_test_results.json"
# Claude settings shared by condition A and condition B (see call_haiku).
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
# Local Ollama endpoint/model used for the metadata pass (see call_local_metadata).
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
# Passed as the `timeout` argument of the Ollama HTTP request.
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates every document to this many characters.
MAX_DOC_CHARS = 12000

# Price multipliers per 1,000,000 tokens; main() uses them to compute the *_cost_usd totals.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"language": "en or other",
|
||||
"char_length": integer,
|
||||
"primary_format": "prose, presentation, list, form, code, or mixed",
|
||||
"structural_signals": {
|
||||
"has_headings": boolean,
|
||||
"has_bullet_lists": boolean,
|
||||
"has_numbered_lists": boolean,
|
||||
"has_tables": boolean,
|
||||
"has_code_blocks": boolean,
|
||||
"has_dates": boolean
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": boolean,
|
||||
"has_institutional_language": boolean,
|
||||
"has_technical_terminology": boolean,
|
||||
"has_first_person": boolean,
|
||||
"has_quotations": boolean
|
||||
},
|
||||
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
|
||||
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
|
||||
}
|
||||
|
||||
JSON only, no commentary.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
|
||||
|
||||
DOCUMENT METADATA:
|
||||
{metadata_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* without a surrounding Markdown code fence.

    A leading ``` or ```json marker and a trailing ``` marker are removed,
    together with the whitespace around them. Falsy input (``None`` or an
    empty string) yields an empty string.
    """
    if not text:
        return ""
    without_opening = re.sub(r"^```(?:json)?\s*", "", text.strip())
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its rows in the ``embeddings`` table.

    All rows whose ``source`` column equals *source* are fetched in ``id``
    order and their ``document`` values are joined with blank lines.

    Args:
        pg_conn: open DB-API connection (main() passes a psycopg2 connection).
        source: value matched against the ``source`` column.

    Returns:
        ``(text, original_length)`` where *text* is the joined document cut to
        ``MAX_DOC_CHARS`` characters and *original_length* is the length before
        truncation; ``(None, 0)`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query or the fetch raises.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarize the reply.

    Returns a dict with the reported input/output token counts, the wall-clock
    latency in seconds (rounded to 2 decimals), the text of the first content
    block (empty string when the reply has no content) and the stop reason.
    Exceptions raised by the client propagate to the caller.
    """
    started = time.time()
    reply = client.messages.create(
        messages=[{"role": "user", "content": prompt_text}],
        temperature=HAIKU_TEMPERATURE,
        max_tokens=HAIKU_MAX_TOKENS,
        model=HAIKU_MODEL,
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface metadata about *document_text*.

    Returns ``{"response": <raw model output>, "latency_s": <seconds>}`` on
    success. Any exception (connection error, timeout, HTTP error status,
    undecodable body) is caught and reported as
    ``{"error": <message>, "latency_s": <seconds>}`` instead of being raised.
    """
    started = time.time()
    try:
        reply = requests.post(
            OLLAMA_URL,
            timeout=LOCAL_TIMEOUT,
            json={
                "model": LOCAL_MODEL,
                "prompt": LOCAL_METADATA_PROMPT + document_text,
                "stream": False,
                "format": "json",
                "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
            },
        )
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": round(time.time() - started, 2)}
    except Exception as exc:
        # Best-effort call: the caller inspects the "error" key and skips condition B.
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
|
||||
|
||||
|
||||
def parse_graph_full(raw):
    """Decode a model reply into ``(entities, edges, parsed_ok)``.

    The reply is accepted only when, after fence stripping, it is a JSON
    object whose "entities" and "edges" values are both lists. Every other
    case (empty text, invalid JSON, wrong shape) yields ``(None, None, False)``.
    """
    failure = (None, None, False)
    payload = strip_json_fences(raw)
    if not payload:
        return failure
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        return failure
    if not isinstance(parsed, dict):
        return failure
    entity_list, edge_list = parsed.get("entities"), parsed.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
|
||||
|
||||
|
||||
def parse_metadata(raw):
    """Decode the local model's reply as JSON.

    Returns the decoded value, or ``None`` when the fence-stripped text is
    empty or is not valid JSON.
    """
    payload = strip_json_fences(raw)
    if payload:
        try:
            return json.loads(payload)
        except json.JSONDecodeError:
            pass
    return None
|
||||
|
||||
|
||||
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    Args:
        entities: list of entity records (dicts with "name"/"type"), or None.
        edges: list of edge records (dicts with "subject"/"predicate"/"object"),
            or None.

    Returns:
        ``None`` when either argument is ``None``; otherwise a dict with
        n_entities, n_edges, predicate_diversity, type_diversity, avg_degree,
        largest_component and largest_component_pct. Counts use the raw list
        lengths; diversity and connectivity use stripped, lower-cased labels
        and ignore list items that are not dicts.
    """
    if entities is None or edges is None:
        return None

    def normalized_values(records, field):
        # Distinct non-empty values of `field`, stripped and lower-cased.
        values = set()
        for record in records:
            if isinstance(record, dict) and record.get(field):
                values.add(str(record[field]).strip().lower())
        return values

    entity_count = len(entities)
    edge_count = len(edges)

    predicate_labels = normalized_values(edges, "predicate")
    type_labels = normalized_values(entities, "type")

    # Each edge contributes to the degree of two nodes.
    mean_degree = 2 * edge_count / entity_count if entity_count > 0 else 0.0

    # Undirected adjacency restricted to edges whose endpoints are both known entities.
    neighbors = {name: set() for name in normalized_values(entities, "name")}
    for edge in edges:
        if isinstance(edge, dict):
            src = str(edge.get("subject", "")).strip().lower()
            dst = str(edge.get("object", "")).strip().lower()
            if src in neighbors and dst in neighbors:
                neighbors[src].add(dst)
                neighbors[dst].add(src)

    # Size of the largest connected component, via an explicit-stack traversal.
    seen = set()
    largest = 0
    for root in neighbors:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for adjacent in neighbors[current] - seen:
                seen.add(adjacent)
                frontier.append(adjacent)
        largest = max(largest, size)

    if entity_count:
        largest_pct = round(100 * largest / entity_count, 1)
    else:
        largest_pct = 0.0

    return {
        "n_entities": entity_count,
        "n_edges": edge_count,
        "predicate_diversity": len(predicate_labels),
        "type_diversity": len(type_labels),
        "avg_degree": round(mean_degree, 2),
        "largest_component": largest,
        "largest_component_pct": largest_pct,
    }
|
||||
|
||||
|
||||
def stratify(docs):
    """Pick the first 5 small, 10 medium and 5 large documents, in input order.

    Buckets are decided by each document's "content_length": below 1000 is
    small, 1000 up to (not including) 5000 is medium, 5000 and above is large.
    The result lists the small picks first, then medium, then large.
    """
    small, medium, large = [], [], []
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            small.append(doc)
        elif length < 5000:
            medium.append(doc)
        else:
            large.append(doc)
    return small[:5] + medium[:10] + large[:5]
|
||||
|
||||
|
||||
def fmt_metrics(m):
    """Render a graph_metrics() dict as a one-line summary, or "n/a" for None."""
    if m is None:
        return "n/a"
    fields = [
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    ]
    return " ".join(fields)
|
||||
|
||||
|
||||
def main():
    """Run the A/B extraction experiment and write a JSON summary.

    For every sampled document: condition A sends the document straight to
    Haiku; condition B first asks the local model for metadata and then sends
    metadata + document to Haiku. Per-document results, token/cost totals and
    per-size-bucket graph-metric averages are written to OUTPUT_FILE and a
    short report is printed. Exits with status 1 when required environment
    variables or the input file are missing.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE) as f:
        v2 = json.load(f)

    # Only documents marked SUCCESS in the earlier run are candidates.
    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (5s/10m/5l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print(f"Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    # NOTE(review): the connection is closed only after the loop completes; an
    # exception raised inside the loop leaves it to interpreter shutdown.
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    for i, doc_meta in enumerate(sample, 1):
        source = doc_meta["source"]
        doc_text, original_len = fetch_document_text(pg_conn, source)
        if not doc_text:
            print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
            results.append({"source": source, "skipped": "not_in_pgvector"})
            continue

        # The bucket is derived from the (possibly truncated) text actually sent,
        # not from the "content_length" recorded in the v2 file.
        sent_len = len(doc_text)
        truncated = original_len > sent_len
        size_bucket = (
            "small" if sent_len < 1000
            else "medium" if sent_len < 5000
            else "large"
        )
        trunc_marker = "*" if truncated else " "
        print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

        # Condition A
        # A failure here is recorded in `a` and the loop still goes on to condition B.
        try:
            a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
            a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
            a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
            print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                  f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
            print(f" {fmt_metrics(a_metrics)}", flush=True)
        except Exception as e:
            print(f" A FAILED: {e}", flush=True)
            a = {"error": str(e)}
            a_metrics = None

        # Condition B local metadata pass
        local_result = call_local_metadata(doc_text)
        if "error" in local_result:
            # Local model failed: keep A's result, mark B as skipped, move on.
            print(f" B local FAILED: {local_result['error']}", flush=True)
            results.append({
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": {
                    "input_tokens": a.get("input_tokens"),
                    "output_tokens": a.get("output_tokens"),
                    "latency_s": a.get("latency_s"),
                    "metrics": a_metrics,
                    "stop_reason": a.get("stop_reason"),
                    "response_text": a.get("response_text", "")[:4000],
                    "error": a.get("error"),
                },
                "condition_b": {
                    "skipped": "local_model_failed",
                    "local_error": local_result["error"],
                    "local_latency_s": local_result.get("latency_s"),
                },
            })
            continue

        local_raw = local_result["response"]
        metadata = parse_metadata(local_raw)
        print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
              flush=True)

        if metadata is None:
            # Unparseable metadata: no API call is made for condition B.
            print(f" B: metadata parse failed — skipping API call", flush=True)
            results.append({
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": {
                    "input_tokens": a.get("input_tokens"),
                    "output_tokens": a.get("output_tokens"),
                    "latency_s": a.get("latency_s"),
                    "metrics": a_metrics,
                    "stop_reason": a.get("stop_reason"),
                    "response_text": a.get("response_text", "")[:4000],
                    "error": a.get("error"),
                },
                "condition_b": {
                    "skipped": "metadata_parse_failed",
                    "local_latency_s": local_result.get("latency_s"),
                    "local_raw": local_raw[:1000],
                },
            })
            continue

        # str.replace (not str.format) is used because the prompt template
        # contains literal JSON braces.
        metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
        b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text

        try:
            b = call_haiku(client, b_prompt)
            b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
            b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
            print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                  f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
            print(f" {fmt_metrics(b_metrics)}", flush=True)
        except Exception as e:
            print(f" B api FAILED: {e}", flush=True)
            b = {"error": str(e)}
            b_metrics = None

        # Per-doc deltas
        # Printed only when both API calls returned token counts.
        if "input_tokens" in a and "input_tokens" in b:
            in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
            out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
            edge_pct_str = "n/a"
            pred_pct_str = "n/a"
            if a_metrics and b_metrics:
                if a_metrics["n_edges"] > 0:
                    edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                if a_metrics["predicate_diversity"] > 0:
                    pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
            print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                  flush=True)

        results.append({
            "source": source,
            "size_bucket": size_bucket,
            "doc_chars_original": original_len,
            "doc_chars_sent": sent_len,
            "truncated": truncated,
            "condition_a": {
                "input_tokens": a.get("input_tokens"),
                "output_tokens": a.get("output_tokens"),
                "latency_s": a.get("latency_s"),
                "metrics": a_metrics,
                "stop_reason": a.get("stop_reason"),
                "response_text": a.get("response_text", "")[:4000],
                "error": a.get("error"),
            },
            "condition_b": {
                "local_latency_s": local_result.get("latency_s"),
                "local_metadata": metadata,
                "local_raw": local_raw[:1000],
                "api_input_tokens": b.get("input_tokens"),
                "api_output_tokens": b.get("output_tokens"),
                "api_latency_s": b.get("latency_s"),
                "metrics": b_metrics,
                "stop_reason": b.get("stop_reason"),
                "response_text": b.get("response_text", "")[:4000],
                "error": b.get("error"),
            },
        })

    pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # A "valid pair" is a document for which both conditions produced parseable
    # graphs; all totals and bucket averages below are restricted to these.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        # Mean of one metric over `rows` for the given condition, rounded to 2
        # decimals; None when no row has metrics.
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # NOTE(review): the :+.1f format would raise if a delta were None
            # (token total of 0 for condition A in this bucket) — confirm unreachable.
            print(f" cost: in {stats['input_delta_pct']:+.1f}% out {stats['output_delta_pct']:+.1f}%")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
# Run the experiment only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,611 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base-Class Enrichment Test — OOP Framing Experiment
|
||||
|
||||
Tests whether non-entity metadata from a local model (domain class, structural
|
||||
signals, presence flags, length, summary) can take load off the API without
|
||||
constraining what it extracts.
|
||||
|
||||
The local model does NOT draft entities. The API still does full extraction.
|
||||
The local model produces metadata that orients the API's reading.
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction, no metadata
|
||||
B — Base-class: Mistral metadata + Haiku full extraction with metadata as frame
|
||||
|
||||
Critical test: B's edge count and predicate diversity must be ≥A's, or close.
|
||||
If B produces fewer edges or less predicate diversity, metadata is acting as
|
||||
constraint and the OOP framing is falsified.
|
||||
|
||||
Sample: 50 docs (small/medium from briefing_test_v2_results.json; large from large_bucket_sources.json):
|
||||
- 15 small (<1000 chars)
|
||||
- 25 medium (1000-5000 chars)
|
||||
- 10 large (5000-12000 chars, capped at 12K)
|
||||
|
||||
Outputs: ~/aaronai/experiments/base_class_validation_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from the project's .env file; main() later reads
# ANTHROPIC_API_KEY and PG_DSN from the environment.
load_dotenv(Path.home() / "aaronai" / ".env")

# Earlier experiment output; main() draws the small/medium sample from its "documents" list.
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
# Output path for this validation run's results.
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "base_class_validation_results.json"
# Claude settings passed to every API call (see call_haiku).
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 8192
HAIKU_TEMPERATURE = 0.0
# Local Ollama endpoint/model used for the metadata pass (see call_local_metadata).
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
# Passed as the `timeout` argument of the Ollama HTTP request.
LOCAL_TIMEOUT = 180
# fetch_document_text() truncates every document to this many characters.
MAX_DOC_CHARS = 12000

# Pricing constants — presumably USD per 1,000,000 input/output tokens;
# confirm against the cost computation in main().
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_METADATA_PROMPT = """Analyze the document below and produce metadata describing its surface features. Do NOT extract entities. Do NOT identify content. Only produce structural and surface-level metadata.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"language": "en or other",
|
||||
"char_length": integer,
|
||||
"primary_format": "prose, presentation, list, form, code, or mixed",
|
||||
"structural_signals": {
|
||||
"has_headings": boolean,
|
||||
"has_bullet_lists": boolean,
|
||||
"has_numbered_lists": boolean,
|
||||
"has_tables": boolean,
|
||||
"has_code_blocks": boolean,
|
||||
"has_dates": boolean
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": boolean,
|
||||
"has_institutional_language": boolean,
|
||||
"has_technical_terminology": boolean,
|
||||
"has_first_person": boolean,
|
||||
"has_quotations": boolean
|
||||
},
|
||||
"domain_class": "technical, administrative, personal, educational, creative, reference, or mixed",
|
||||
"one_sentence_summary": "string of 25 words or fewer describing what the document is about"
|
||||
}
|
||||
|
||||
JSON only, no commentary.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT = """You are extracting a knowledge graph from a document. The document has been pre-analyzed by a local model and the following metadata is provided as orienting context — not as constraint. Extract every entity and every relationship in the document. Do not limit your extraction to what the metadata suggests; the metadata is here to orient your reading, not to bound it.
|
||||
|
||||
DOCUMENT METADATA:
|
||||
{metadata_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. Do not filter for salience. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* without a surrounding Markdown code fence.

    A leading ``` or ```json marker and a trailing ``` marker are removed,
    together with the whitespace around them. Falsy input (``None`` or an
    empty string) yields an empty string.
    """
    if not text:
        return ""
    without_opening = re.sub(r"^```(?:json)?\s*", "", text.strip())
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its rows in the ``embeddings`` table.

    All rows whose ``source`` column equals *source* are fetched in ``id``
    order and their ``document`` values are joined with blank lines.

    Args:
        pg_conn: open DB-API connection (main() reads a PG_DSN for it).
        source: value matched against the ``source`` column.

    Returns:
        ``(text, original_length)`` where *text* is the joined document cut to
        ``MAX_DOC_CHARS`` characters and *original_length* is the length before
        truncation; ``(None, 0)`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query or the fetch raises.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarize the reply.

    Returns a dict with the reported input/output token counts, the wall-clock
    latency in seconds (rounded to 2 decimals), the text of the first content
    block (empty string when the reply has no content) and the stop reason.
    Exceptions raised by the client propagate to the caller.
    """
    started = time.time()
    reply = client.messages.create(
        messages=[{"role": "user", "content": prompt_text}],
        temperature=HAIKU_TEMPERATURE,
        max_tokens=HAIKU_MAX_TOKENS,
        model=HAIKU_MODEL,
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local_metadata(document_text):
    """Ask the local Ollama model for surface metadata about *document_text*.

    Returns ``{"response": <raw model output>, "latency_s": <seconds>}`` on
    success. Any exception (connection error, timeout, HTTP error status,
    undecodable body) is caught and reported as
    ``{"error": <message>, "latency_s": <seconds>}`` instead of being raised.
    """
    started = time.time()
    try:
        reply = requests.post(
            OLLAMA_URL,
            timeout=LOCAL_TIMEOUT,
            json={
                "model": LOCAL_MODEL,
                "prompt": LOCAL_METADATA_PROMPT + document_text,
                "stream": False,
                "format": "json",
                "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
            },
        )
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": round(time.time() - started, 2)}
    except Exception as exc:
        # Best-effort call: failures are returned under the "error" key, not raised.
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
|
||||
|
||||
|
||||
def parse_graph_full(raw):
    """Decode a model reply into ``(entities, edges, parsed_ok)``.

    The reply is accepted only when, after fence stripping, it is a JSON
    object whose "entities" and "edges" values are both lists. Every other
    case (empty text, invalid JSON, wrong shape) yields ``(None, None, False)``.
    """
    failure = (None, None, False)
    payload = strip_json_fences(raw)
    if not payload:
        return failure
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        return failure
    if not isinstance(parsed, dict):
        return failure
    entity_list, edge_list = parsed.get("entities"), parsed.get("edges")
    if not (isinstance(entity_list, list) and isinstance(edge_list, list)):
        return failure
    return entity_list, edge_list, True
|
||||
|
||||
|
||||
def parse_metadata(raw):
    """Decode the local model's reply as JSON.

    Returns the decoded value, or ``None`` when the fence-stripped text is
    empty or is not valid JSON.
    """
    payload = strip_json_fences(raw)
    if payload:
        try:
            return json.loads(payload)
        except json.JSONDecodeError:
            pass
    return None
|
||||
|
||||
|
||||
def graph_metrics(entities, edges):
    """Summarize an extracted graph as a dict of simple quality metrics.

    Args:
        entities: list of entity records (dicts with "name"/"type"), or None.
        edges: list of edge records (dicts with "subject"/"predicate"/"object"),
            or None.

    Returns:
        ``None`` when either argument is ``None``; otherwise a dict with
        n_entities, n_edges, predicate_diversity, type_diversity, avg_degree,
        largest_component and largest_component_pct. Counts use the raw list
        lengths; diversity and connectivity use stripped, lower-cased labels
        and ignore list items that are not dicts.
    """
    if entities is None or edges is None:
        return None

    def normalized_values(records, field):
        # Distinct non-empty values of `field`, stripped and lower-cased.
        values = set()
        for record in records:
            if isinstance(record, dict) and record.get(field):
                values.add(str(record[field]).strip().lower())
        return values

    entity_count = len(entities)
    edge_count = len(edges)

    predicate_labels = normalized_values(edges, "predicate")
    type_labels = normalized_values(entities, "type")

    # Each edge contributes to the degree of two nodes.
    mean_degree = 2 * edge_count / entity_count if entity_count > 0 else 0.0

    # Undirected adjacency restricted to edges whose endpoints are both known entities.
    neighbors = {name: set() for name in normalized_values(entities, "name")}
    for edge in edges:
        if isinstance(edge, dict):
            src = str(edge.get("subject", "")).strip().lower()
            dst = str(edge.get("object", "")).strip().lower()
            if src in neighbors and dst in neighbors:
                neighbors[src].add(dst)
                neighbors[dst].add(src)

    # Size of the largest connected component, via an explicit-stack traversal.
    seen = set()
    largest = 0
    for root in neighbors:
        if root in seen:
            continue
        seen.add(root)
        frontier = [root]
        size = 0
        while frontier:
            current = frontier.pop()
            size += 1
            for adjacent in neighbors[current] - seen:
                seen.add(adjacent)
                frontier.append(adjacent)
        largest = max(largest, size)

    if entity_count:
        largest_pct = round(100 * largest / entity_count, 1)
    else:
        largest_pct = 0.0

    return {
        "n_entities": entity_count,
        "n_edges": edge_count,
        "predicate_diversity": len(predicate_labels),
        "type_diversity": len(type_labels),
        "avg_degree": round(mean_degree, 2),
        "largest_component": largest,
        "largest_component_pct": largest_pct,
    }
|
||||
|
||||
|
||||
def stratify(docs):
    """Select the experiment sample: small and medium docs from v2, large from a file.

    Small docs are those with ``content_length`` under 1000 (first 15 kept, in
    input order); medium docs are 1000 <= ``content_length`` < 5000 (first 25
    kept). The large bucket is loaded separately from
    ``~/aaronai/large_bucket_sources.json`` (sampled fresh from pgvector since
    v2 has no large docs); each source name listed there is wrapped in a
    synthetic doc-meta dict with ``content_length`` 0 and status ``SUCCESS``.

    Args:
        docs: iterable of doc-meta dicts, each carrying ``content_length``.

    Returns:
        list: small + medium + large doc-meta dicts, in that order.
    """
    # Local import kept so this function does not depend on the module's import block.
    import json as _json

    sized = [(d, d["content_length"]) for d in docs]
    small = [d for d, n in sized if n < 1000][:15]
    medium = [d for d, n in sized if 1000 <= n < 5000][:25]

    # Load large bucket from external sources file
    large_sources_file = Path.home() / "aaronai" / "large_bucket_sources.json"
    if large_sources_file.exists():
        large_source_names = _json.loads(large_sources_file.read_text(encoding="utf-8"))
        # Synthesize doc_meta entries for the large sources
        large = [{"source": s, "content_length": 0, "status": "SUCCESS"}
                 for s in large_source_names]
        # Report what was actually selected; the buckets can hold fewer than
        # the 15/25 caps, and the sources file can list any number of docs.
        print(f"Stratify: {len(small)} small + {len(medium)} medium from v2, "
              f"{len(large)} large from large_bucket_sources.json")
    else:
        large = []
        print("WARN: large_bucket_sources.json not found, no large docs in sample")

    return small + medium + large
|
||||
|
||||
|
||||
def fmt_metrics(m):
    """Render a graph-metrics dict as a compact one-line summary.

    Returns "n/a" when *m* is None; otherwise a space-separated string of the
    entity count, edge count, predicate/type diversity, average degree and the
    largest-component size over the entity count.
    """
    if m is None:
        return "n/a"
    fields = [
        f"e={m['n_entities']}",
        f"edge={m['n_edges']}",
        f"pred={m['predicate_diversity']}",
        f"type={m['type_diversity']}",
        f"deg={m['avg_degree']}",
        f"comp={m['largest_component']}/{m['n_entities']}",
    ]
    return " ".join(fields)
|
||||
|
||||
|
||||
def _condition_a_record(a, a_metrics):
    """Build the condition-A section that every per-document result row shares."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "metrics": a_metrics,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:32000],
        "error": a.get("error"),
    }


def _signed_pct(value):
    """Format a percentage delta with an explicit sign, or "n/a" when it is None."""
    return "n/a" if value is None else f"{value:+.1f}%"


def main():
    """Run the base-class enrichment A/B experiment and write the JSON summary.

    For every sampled document the text is fetched from Postgres, then:
      * condition A sends ``CONDITION_A_PROMPT`` + document to Haiku;
      * condition B asks the local model for metadata, then sends
        ``CONDITION_B_API_PROMPT`` (with that metadata substituted in) +
        document to Haiku.
    Graph metrics are computed for each successfully parsed response, and the
    per-document rows, per-bucket averages and token-cost totals are written to
    ``OUTPUT_FILE``.

    Exits with status 1 when ``ANTHROPIC_API_KEY`` or ``PG_DSN`` is unset, or
    when ``V2_FILE`` does not exist.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE) as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (15s/25m/10l, file order)")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE}")
    print("Test: base-class metadata as orienting frame, NOT entity drafting")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    # try/finally so the database connection is released even when a document
    # raises something the per-condition handlers below do not catch.
    try:
        for i, doc_meta in enumerate(sample, 1):
            source = doc_meta["source"]
            doc_text, original_len = fetch_document_text(pg_conn, source)
            if not doc_text:
                print(f"[{i:02d}/{len(sample)}] {source[:55]} — SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            sent_len = len(doc_text)
            truncated = original_len > sent_len
            size_bucket = (
                "small" if sent_len < 1000
                else "medium" if sent_len < 5000
                else "large"
            )
            trunc_marker = "*" if truncated else " "
            print(f"[{i:02d}/{len(sample)}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

            # Condition A
            try:
                a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
                a_ents, a_edges, a_ok = parse_graph_full(a["response_text"])
                a_metrics = graph_metrics(a_ents, a_edges) if a_ok else None
                print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
                      f"stop={a['stop_reason']} t={a['latency_s']}s", flush=True)
                print(f" {fmt_metrics(a_metrics)}", flush=True)
            except Exception as e:
                print(f" A FAILED: {e}", flush=True)
                a = {"error": str(e)}
                a_metrics = None

            # Fields common to every row for this document; each exit path
            # below only adds its own "condition_b" section.
            row = {
                "source": source,
                "size_bucket": size_bucket,
                "doc_chars_original": original_len,
                "doc_chars_sent": sent_len,
                "truncated": truncated,
                "condition_a": _condition_a_record(a, a_metrics),
            }

            # Condition B local metadata pass
            local_result = call_local_metadata(doc_text)
            if "error" in local_result:
                print(f" B local FAILED: {local_result['error']}", flush=True)
                results.append({
                    **row,
                    "condition_b": {
                        "skipped": "local_model_failed",
                        "local_error": local_result["error"],
                        "local_latency_s": local_result.get("latency_s"),
                    },
                })
                continue

            local_raw = local_result["response"]
            metadata = parse_metadata(local_raw)
            # Override LLM-hallucinated char_length with Python-computed truth
            if metadata is not None and isinstance(metadata, dict):
                metadata["char_length"] = len(doc_text)
            print(f" B local: t={local_result['latency_s']}s metadata_parsed={metadata is not None}",
                  flush=True)

            if metadata is None:
                print(" B: metadata parse failed — skipping API call", flush=True)
                results.append({
                    **row,
                    "condition_b": {
                        "skipped": "metadata_parse_failed",
                        "local_latency_s": local_result.get("latency_s"),
                        "local_raw": local_raw[:1000],
                    },
                })
                continue

            metadata_json = json.dumps(metadata, ensure_ascii=False, indent=2)
            b_prompt = CONDITION_B_API_PROMPT.replace("{metadata_json}", metadata_json) + doc_text

            try:
                b = call_haiku(client, b_prompt)
                b_ents, b_edges, b_ok = parse_graph_full(b["response_text"])
                b_metrics = graph_metrics(b_ents, b_edges) if b_ok else None
                print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                      f"stop={b['stop_reason']} t={b['latency_s']}s", flush=True)
                print(f" {fmt_metrics(b_metrics)}", flush=True)
            except Exception as e:
                print(f" B api FAILED: {e}", flush=True)
                b = {"error": str(e)}
                b_metrics = None

            # Per-doc deltas
            if "input_tokens" in a and "input_tokens" in b:
                in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
                out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
                edge_pct_str = "n/a"
                pred_pct_str = "n/a"
                if a_metrics and b_metrics:
                    if a_metrics["n_edges"] > 0:
                        edge_pct_str = f"{(b_metrics['n_edges'] - a_metrics['n_edges']) / a_metrics['n_edges'] * 100:+.1f}%"
                    if a_metrics["predicate_diversity"] > 0:
                        pred_pct_str = f"{(b_metrics['predicate_diversity'] - a_metrics['predicate_diversity']) / a_metrics['predicate_diversity'] * 100:+.1f}%"
                print(f" Δ in={in_pct:+.1f}% out={out_pct:+.1f}% edges={edge_pct_str} pred={pred_pct_str}",
                      flush=True)

            results.append({
                **row,
                "condition_b": {
                    "local_latency_s": local_result.get("latency_s"),
                    "local_metadata": metadata,
                    "local_raw": local_raw[:1000],
                    "api_input_tokens": b.get("input_tokens"),
                    "api_output_tokens": b.get("output_tokens"),
                    "api_latency_s": b.get("latency_s"),
                    "metrics": b_metrics,
                    "stop_reason": b.get("stop_reason"),
                    "response_text": b.get("response_text", "")[:32000],
                    "error": b.get("error"),
                },
            })
    finally:
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where BOTH conditions produced parseable graphs count
    # toward the token/cost totals and the per-bucket averages.
    valid = [r for r in results
             if r.get("condition_a", {}).get("metrics") is not None
             and r.get("condition_b", {}).get("metrics") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    def avg_metric(rows, condition, key):
        vals = [r[condition]["metrics"][key] for r in rows if r[condition]["metrics"]]
        return round(statistics.mean(vals), 2) if vals else None

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        by_bucket[bucket] = {
            "n": len(rows),
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_entities": avg_metric(rows, "condition_a", "n_entities"),
            "b_avg_entities": avg_metric(rows, "condition_b", "n_entities"),
            "a_avg_edges": avg_metric(rows, "condition_a", "n_edges"),
            "b_avg_edges": avg_metric(rows, "condition_b", "n_edges"),
            "a_avg_predicate_diversity": avg_metric(rows, "condition_a", "predicate_diversity"),
            "b_avg_predicate_diversity": avg_metric(rows, "condition_b", "predicate_diversity"),
            "a_avg_type_diversity": avg_metric(rows, "condition_a", "type_diversity"),
            "b_avg_type_diversity": avg_metric(rows, "condition_b", "type_diversity"),
            "a_avg_degree": avg_metric(rows, "condition_a", "avg_degree"),
            "b_avg_degree": avg_metric(rows, "condition_b", "avg_degree"),
            "a_avg_largest_component_pct": avg_metric(rows, "condition_a", "largest_component_pct"),
            "b_avg_largest_component_pct": avg_metric(rows, "condition_b", "largest_component_pct"),
        }

    summary = {
        "experiment": "base_class_test",
        "title": "Base-Class Enrichment — OOP Framing",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By bucket — graph metrics (A vs B):")
    for bucket, stats in by_bucket.items():
        if stats:
            print(f" {bucket:6s} (n={stats['n']}):")
            # The deltas are None when condition A's token total for the bucket
            # is zero; formatting None with ":+.1f" would raise TypeError.
            print(f" cost: in {_signed_pct(stats['input_delta_pct'])} out {_signed_pct(stats['output_delta_pct'])}")
            print(f" entities: A={stats['a_avg_entities']} B={stats['b_avg_entities']}")
            print(f" edges: A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
            print(f" predicate diversity: A={stats['a_avg_predicate_diversity']} B={stats['b_avg_predicate_diversity']}")
            print(f" type diversity: A={stats['a_avg_type_diversity']} B={stats['b_avg_type_diversity']}")
            print(f" avg degree: A={stats['a_avg_degree']} B={stats['b_avg_degree']}")
            print(f" largest component %: A={stats['a_avg_largest_component_pct']} B={stats['b_avg_largest_component_pct']}")
    print()
    print(f"Results: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BirdAI Briefing Generator v2 — Experiment 002b
|
||||
===============================================
|
||||
Changes from v1 (based on Experiment 004 human evaluation):
|
||||
- document_type now pre-classified by rule, not by model
|
||||
- Capture template header stripped before model sees content
|
||||
- noise_signals constrained to controlled vocabulary
|
||||
- Model prompt simplified — focuses only on reliable signal fields
|
||||
- Expanded document type vocabulary for BirdAI-specific types
|
||||
Results written to ~/aaronai/briefing_test_v2_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import hashlib
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Read ~/aaronai/.env into the process environment before PG_DSN is looked up.
load_dotenv(os.path.expanduser("~/aaronai/.env"))

PG_DSN = os.getenv("PG_DSN")  # None when unset; get_sample_documents() raises in that case
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_v2_results.json")
MODEL = "mistral"  # model name sent in the generate request
SAMPLE_SIZE = 50  # LIMIT applied to the sampling query
OLLAMA_URL = "http://localhost:11434/api/generate"

# Document types accepted by sanitize_briefing(); any other model answer
# is replaced with "unknown".
VALID_DOC_TYPES = {
    "voice_capture", "image_capture",
    "dream_nrem", "dream_rem", "dream_lucid", "dream_synthesis",
    "presentation", "code", "spreadsheet",
    "academic_pdf", "technical_doc", "chat_log",
    "book_excerpt", "form", "syllabus", "email",
    "notes", "purchase_order", "annual_report",
    "invoice", "memo", "report", "unknown"
}

# Allowed values for the "density" and "extraction_priority" briefing fields.
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}

# Controlled vocabulary for "noise_signals"; other terms are dropped
# during sanitization.
VALID_NOISE_SIGNALS = {
    "repeated_headers", "page_numbers", "formatting_artifacts",
    "boilerplate", "watermarks", "footers", "line_numbers",
    "encoding_artifacts", "ocr_errors"
}

# Controlled vocabulary for "structure_signals"; other terms are dropped
# during sanitization.
VALID_STRUCTURE_SIGNALS = {
    "headings", "bullet_lists", "numbered_lists", "tables",
    "code_blocks", "citations", "footnotes", "images",
    "forms", "columns", "sections"
}
|
||||
|
||||
|
||||
def pre_classify_document(source, content):
    """Classify a document by rule and strip its capture/dream template header.

    Filename rules (on the lower-cased basename of *source*) are tried first;
    when none matches, the leading text of *content* is inspected for
    "# Dream" / "# Capture" headings.

    Args:
        source: path or name the document came from.
        content: full document text.

    Returns:
        tuple: ``(doc_type, cleaned_content)``. ``doc_type`` is None when no
        rule matched. ``cleaned_content`` is the text after the first "---"
        when the part before it carries a template marker and the remainder is
        non-empty; otherwise it is *content* unchanged.
    """
    name = os.path.basename(source).lower()

    # Template stripping: only the text before the first "---" is treated as header.
    stripped = content
    head, divider, tail = content.partition("---")
    if divider:
        head = head.lower()
        template_markers = ("**type:**", "**modality:**", "# capture", "# dream")
        body = tail.strip()
        if body and any(tag in head for tag in template_markers):
            stripped = body

    def classify():
        # Filename-based rules, most specific first.
        if "nrem" in name:
            return "dream_nrem"
        if "lucid" in name:
            return "dream_lucid"
        if "-rem-" in name or name.endswith("-rem.md"):
            return "dream_rem"
        if name.endswith(".md") and "synthesis" in name:
            return "dream_synthesis"
        if "-voice" in name or "voice-" in name:
            return "voice_capture"
        if "-image" in name or "image-" in name:
            return "image_capture"
        if name.endswith((".pptx", ".ppt")):
            return "presentation"
        if name.endswith((".xlsx", ".xls", ".csv")):
            return "spreadsheet"
        code_suffixes = (".py", ".js", ".ts", ".cpp", ".c", ".h", ".java", ".rs",
                         "cmakelists.txt")
        if name.endswith(code_suffixes) or name == "makefile":
            return "code"

        # Content-based fallbacks for files whose names carry no hint.
        if content.startswith("# Dream"):
            intro = content[:50].lower()
            # "nrem" must be tested before "rem", which it contains.
            for needle, label in (("nrem", "dream_nrem"),
                                  ("lucid", "dream_lucid"),
                                  ("rem", "dream_rem")):
                if needle in intro:
                    return label
            return "dream_synthesis"
        if content.startswith("# Capture"):
            return "voice_capture" if "voice" in content[:100].lower() else "image_capture"
        return None

    return classify(), stripped
|
||||
|
||||
|
||||
def build_briefing_prompt(content, pre_classified_type=None):
    """Build the local-model prompt that asks for a JSON briefing of *content*.

    When *pre_classified_type* is truthy, the ``document_type`` line is
    pre-filled with it and marked as not to be changed; otherwise the model is
    given the list of types to choose from. Only the first 1500 characters of
    *content* are embedded in the prompt.
    """
    type_line = (
        f'\n "document_type": "{pre_classified_type}", // pre-classified, do not change'
        if pre_classified_type
        else '\n "document_type": "one of: academic_pdf, technical_doc, chat_log, book_excerpt, form, syllabus, email, notes, purchase_order, annual_report, invoice, memo, report, unknown",'
    )
    excerpt = content[:1500]

    return f"""Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.

Return exactly this structure:
{{{type_line}
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}}

Rules:
- density: high=information dense technical or academic, medium=mixed, low=narrative/literary/sparse/short
- has_proper_nouns: true if you see capitalized words that are NOT sentence starts or template headers
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: use ONLY these terms: headings, bullet_lists, numbered_lists, tables, code_blocks, citations, footnotes, images, forms, columns, sections
- noise_signals: use ONLY these terms: repeated_headers, page_numbers, formatting_artifacts, boilerplate, watermarks, footers, line_numbers, encoding_artifacts, ocr_errors
- extraction_priority: full if density=high and likely_has_named_entities=true; skip if density=low AND likely_has_named_entities=false AND content is under 200 words; partial otherwise

Document:
{excerpt}"""
|
||||
|
||||
|
||||
def get_sample_documents():
    """Fetch up to ``SAMPLE_SIZE`` documents, one per distinct source, from Postgres.

    Only rows whose ``document`` is longer than 100 and shorter than 3000
    characters are considered.

    Returns:
        list: rows from a ``RealDictCursor`` with the columns ``id``,
        ``document``, ``source`` and ``created_at``.

    Raises:
        RuntimeError: if ``PG_DSN`` is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    # try/finally so the cursor and connection are released even when the
    # query or fetch raises.
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            # NOTE(review): random() only picks which row represents each
            # source; LIMIT is applied to the source-ordered result, so this
            # looks like it always returns the first SAMPLE_SIZE sources in
            # sort order rather than a random set of sources — confirm that
            # this is intended.
            cur.execute("""
                SELECT DISTINCT ON (source) id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                AND length(document) < 3000
                ORDER BY source, random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def run_briefing(prompt):
    """Send *prompt* to the local model endpoint and parse the JSON object it returns.

    Returns:
        tuple: ``(parsed, raw)`` where ``parsed`` is the decoded dict and
        ``raw`` the model's stripped response text on success; on any failure
        ``(None, message)`` where ``message`` starts with a tag naming the
        failure (NO_JSON, NOT_DICT, URL_ERROR, TIMEOUT, JSON_ERROR or ERROR).
    """
    request_body = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            envelope = json.loads(resp.read().decode())
        raw = envelope.get("response", "").strip()

        # Take the outermost {...} span so any prose around the JSON is ignored.
        first_brace = raw.find("{")
        last_brace = raw.rfind("}")
        if first_brace == -1 or last_brace == -1:
            return None, f"NO_JSON: {raw[:200]}"

        candidate = json.loads(raw[first_brace:last_brace + 1])
        if isinstance(candidate, dict):
            return candidate, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as err:
        return None, f"URL_ERROR: {err}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as err:
        return None, f"JSON_ERROR: {err} | raw: {raw[:200]}"
    except Exception as err:
        return None, f"ERROR: {type(err).__name__}: {err}"
|
||||
|
||||
|
||||
def sanitize_briefing(briefing, pre_classified_type=None):
    """Coerce a raw model briefing into the validated schema.

    Args:
        briefing: dict parsed from the model's JSON response.
        pre_classified_type: rule-based document type; when truthy it
            replaces whatever the model returned for ``document_type``.

    Returns:
        dict: ``document_type``, ``primary_language``, ``density``, the six
        boolean signal flags, ``structure_signals``, ``noise_signals`` and
        ``extraction_priority``. Enumerated fields fall back to a default when
        the model's value is outside the allowed set, and signal lists keep
        only terms from their controlled vocabulary.
    """
    def normalized(key, fallback):
        # Lower-cased, stripped string form of a field.
        return str(briefing.get(key, fallback)).lower().strip()

    def constrained(key, fallback, allowed):
        # Normalized value when it is in the allowed set, else the fallback.
        candidate = normalized(key, fallback)
        return candidate if candidate in allowed else fallback

    safe = {}
    safe["document_type"] = (
        pre_classified_type
        if pre_classified_type
        else constrained("document_type", "unknown", VALID_DOC_TYPES)
    )
    safe["primary_language"] = normalized("primary_language", "en")[:10]
    safe["density"] = constrained("density", "medium", VALID_DENSITIES)

    flag_names = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for flag in flag_names:
        answer = briefing.get(flag, False)
        if isinstance(answer, str):
            # Models sometimes emit booleans as strings.
            safe[flag] = answer.lower() in ("true", "yes", "1")
        else:
            safe[flag] = bool(answer)

    vocabularies = (
        ("structure_signals", VALID_STRUCTURE_SIGNALS),
        ("noise_signals", VALID_NOISE_SIGNALS),
    )
    for key, vocabulary in vocabularies:
        reported = briefing.get(key, [])
        if isinstance(reported, str):
            # A bare string is treated as a one-element list.
            reported = [reported]
        elif not isinstance(reported, list):
            reported = []
        terms = (str(item).lower().strip() for item in reported)
        safe[key] = [term for term in terms if term in vocabulary]

    safe["extraction_priority"] = constrained("extraction_priority", "partial", VALID_PRIORITIES)
    return safe
|
||||
|
||||
|
||||
def estimate_token_reduction(original_text, briefing):
    """Estimate how many tokens a briefing would save for a document.

    The document's token count is approximated as ``len(original_text) / 4``
    (minimum 1). A fixed 200 tokens of orientation are counted as saved; each
    noise signal in the briefing adds 5% of the document's tokens, capped at
    40%. A briefing with ``extraction_priority == "skip"`` is reported as a
    100% reduction.

    Returns:
        dict: the approximate token count plus the saved-token and
        percentage estimates.
    """
    approx_tokens = max(len(original_text) / 4, 1)
    orientation_saved = 200

    estimate = {"original_tokens_approx": round(approx_tokens)}

    if briefing.get("extraction_priority") == "skip":
        estimate["orientation_tokens_saved"] = round(approx_tokens + 200)
        estimate["noise_reduction_pct"] = 100.0
        estimate["total_reduction_pct"] = 100.0
        estimate["note"] = "skip — no API call"
        return estimate

    noise_fraction = min(len(briefing.get("noise_signals", [])) * 0.05, 0.40)
    noise_saved = approx_tokens * noise_fraction
    combined_saved = orientation_saved + noise_saved
    # Capped at 99% so only a skipped document reports a 100% reduction.
    overall_pct = min((combined_saved / (approx_tokens + 200)) * 100, 99.0)

    estimate["orientation_tokens_saved"] = orientation_saved
    estimate["noise_tokens_saved"] = round(noise_saved)
    estimate["noise_reduction_pct"] = round(noise_fraction * 100, 1)
    estimate["total_reduction_pct"] = round(overall_pct, 1)
    return estimate
|
||||
|
||||
|
||||
def format_eta(elapsed_times, completed, total):
    """Return an "ETA: H:MM:SS" label for the remaining documents.

    The estimate is the mean of *elapsed_times* (their sum divided by
    *completed*) multiplied by the number of documents still to run, truncated
    to whole seconds. Before any document has completed, the placeholder
    "ETA: --:--" is returned.
    """
    if completed != 0:
        mean_seconds = sum(elapsed_times) / completed
        remaining = timedelta(seconds=int((total - completed) * mean_seconds))
        return "ETA: " + str(remaining)
    return "ETA: --:--"
|
||||
|
||||
|
||||
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of *text*."""
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()[:8]
|
||||
|
||||
|
||||
def main():
    """Run the briefing-generator experiment over a document sample.

    Each sampled document is pre-classified by rule, briefed by the local
    model, sanitized, and scored with an estimated token reduction. Results
    are written to ``RESULTS_FILE`` after every document and again, with the
    summary filled in, at the end; the summary is also printed.
    """
    test_start = time.time()
    print("\nBirdAI Briefing Generator v2 — Experiment 002b")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print("Changes: rule-based doc_type, template stripping, controlled vocab")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")

    results = {
        "meta": {"model": MODEL, "version": "v2", "sample_size": len(docs),
                 "started": datetime.now().isoformat(), "completed": None,
                 "total_elapsed_seconds": None, "avg_seconds_per_doc": None},
        "documents": [], "summary": {}
    }

    success_count = 0
    failed_count = 0
    pre_classified_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # "or" also covers a NULL source column, which would otherwise break
        # the basename lookup and the slicing in the progress line below.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)

        pre_type, cleaned_content = pre_classify_document(source, content)
        was_pre_classified = pre_type is not None
        if was_pre_classified:
            pre_classified_count += 1

        eta_str = format_eta(elapsed_times, i, len(docs))
        # R = type decided by rule, M = left to the model.
        pre_flag = "R" if was_pre_classified else "M"
        print(f"[{i+1:02d}/{len(docs)}][{pre_flag}] {source[:36]:<36} {eta_str:<14}", end=" ", flush=True)

        prompt = build_briefing_prompt(cleaned_content, pre_type)
        t_start = time.time()
        briefing, raw = run_briefing(prompt)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            # On failure run_briefing returns the error message in place of the raw text.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "pre_classified_type": pre_type, "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing, pre_type)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(cleaned_content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "cleaned_content_length": len(cleaned_content),
                "status": "SUCCESS", "pre_classified_type": pre_type,
                "was_pre_classified": was_pre_classified, "elapsed_seconds": elapsed,
                "briefing": briefing, "token_reduction_estimate": reduction
            })

        # Checkpoint after every document so partial results survive an
        # interrupted run.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # An empty sample must not divide by zero; report a 0% success rate instead.
    success_fraction = success_count / total if total else 0.0
    summary = {
        "total": total, "success": success_count, "failed": failed_count,
        "success_rate": round(success_fraction * 100, 1),
        "pre_classified_by_rule": pre_classified_count,
        "classified_by_model": total - pre_classified_count,
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed, "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_fraction >= 0.8
    }
    results["summary"] = summary
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 75)
    print("RESULTS — Briefing Generator v2")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Pre-classified (rule): {pre_classified_count}")
    print(f" Classified (model): {total - pre_classified_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BirdAI Briefing Generator Test
|
||||
===============================
|
||||
Tests the local LLM as a document briefing generator.
|
||||
The local model produces a structured roadmap for the API —
|
||||
cleaning, structure detection, signal flagging — without semantic judgment.
|
||||
Results written to ~/aaronai/briefing_test_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import hashlib
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load settings from the project .env file before reading the environment.
load_dotenv(os.path.expanduser("~/aaronai/.env"))

PG_DSN = os.getenv("PG_DSN")  # PostgreSQL connection string; None when unset
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json")
MODEL = "mistral"  # model name sent in the Ollama request payload
SAMPLE_SIZE = 50  # LIMIT applied to the sample query
OLLAMA_URL = "http://localhost:11434/api/generate"

# Allowed values for the categorical briefing fields; sanitize_briefing()
# replaces anything outside these sets with a default.
VALID_DOC_TYPES = {
    "academic_pdf", "technical_doc", "chat_log", "code",
    "presentation", "book_excerpt", "form", "syllabus",
    "email", "notes", "unknown"
}
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}
|
||||
|
||||
BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.
|
||||
|
||||
Return exactly this structure:
|
||||
{
|
||||
"document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown",
|
||||
"primary_language": "language code e.g. en, fr, de",
|
||||
"density": "one of: high, medium, low",
|
||||
"has_proper_nouns": true or false,
|
||||
"has_dates": true or false,
|
||||
"has_numeric_data": true or false,
|
||||
"has_institutional_language": true or false,
|
||||
"has_technical_terms": true or false,
|
||||
"likely_has_named_entities": true or false,
|
||||
"structure_signals": [],
|
||||
"noise_signals": [],
|
||||
"extraction_priority": "one of: full, partial, skip"
|
||||
}
|
||||
|
||||
Rules:
|
||||
- document_type: identify from formatting patterns and vocabulary, not meaning
|
||||
- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse
|
||||
- has_proper_nouns: true if you see capitalized words that are not sentence starts
|
||||
- has_dates: true if you see date patterns (numbers with months, years, slashes)
|
||||
- has_numeric_data: true if you see measurements, percentages, statistics
|
||||
- has_institutional_language: true if you see words like university, department, policy, committee, grant
|
||||
- has_technical_terms: true if you see domain-specific jargon or acronyms
|
||||
- likely_has_named_entities: true if has_proper_nouns is true
|
||||
- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"]
|
||||
- noise_signals: list any noise patterns you see e.g. ["repeated_headers", "page_numbers", "formatting_artifacts", "boilerplate"]
|
||||
- extraction_priority: full if density=high and likely_has_named_entities=true, skip if density=low and likely_has_named_entities=false, partial otherwise
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_sample_documents():
    """Fetch a random sample of documents, one chunk per distinct source.

    The inner query keeps one randomly chosen row per ``source`` among rows
    whose ``document`` is between 101 and 2999 characters long. The outer
    query then shuffles those rows before applying the limit, so the sample
    is not simply the alphabetically first sources.

    Returns:
        A list of dict-like rows with keys ``id``, ``document``, ``source``
        and ``created_at`` (at most SAMPLE_SIZE rows).

    Raises:
        RuntimeError: if PG_DSN is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    try:
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT id, document, source, created_at
                FROM (
                    SELECT DISTINCT ON (source) id, document, source, created_at
                    FROM embeddings
                    WHERE length(document) > 100
                      AND length(document) < 3000
                    ORDER BY source, random()
                ) AS one_per_source
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        # `with conn` would only end the transaction; close explicitly.
        conn.close()
|
||||
|
||||
|
||||
def run_briefing(text):
    """Ask the local model for a JSON briefing of ``text``.

    Only the first 1500 characters of ``text`` are appended to
    BRIEFING_PROMPT. The request is sent to OLLAMA_URL with a 180 second
    timeout and streaming disabled.

    Returns:
        ``(briefing_dict, raw_response)`` on success, otherwise
        ``(None, error_string)`` where the string starts with one of
        ``NO_JSON``, ``NOT_DICT``, ``URL_ERROR``, ``TIMEOUT``,
        ``JSON_ERROR`` or ``ERROR``.
    """
    body = json.dumps({
        "model": MODEL,
        "prompt": BRIEFING_PROMPT + text[:1500],
        "stream": False
    }).encode()

    # Kept outside the try so the JSON error handler can always report it.
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=body,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            envelope = json.loads(resp.read().decode())
        raw = envelope.get("response", "").strip()

        # The model may wrap the JSON in prose; take the outermost {...} span.
        if "{" not in raw or "}" not in raw:
            return None, f"NO_JSON: {raw[:200]}"
        candidate = raw[raw.index("{"):raw.rindex("}") + 1]

        parsed = json.loads(candidate)
        if isinstance(parsed, dict):
            return parsed, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
|
||||
|
||||
|
||||
def sanitize_briefing(briefing):
    """Coerce a model-produced briefing dict into the expected schema.

    Categorical fields are lower-cased, stripped and replaced by a default
    when not in the allowed set. Flag fields become booleans, signal fields
    become lists of strings, and the language code is capped at 10 characters.

    Returns:
        A new dict; ``briefing`` itself is not modified.
    """
    def _choice(key, fallback, allowed):
        # Normalise, then fall back when the value is not an allowed one.
        value = str(briefing.get(key, fallback)).lower().strip()
        return value if value in allowed else fallback

    cleaned = {}
    cleaned["document_type"] = _choice("document_type", "unknown", VALID_DOC_TYPES)
    cleaned["primary_language"] = (
        str(briefing.get("primary_language", "en")).lower().strip()[:10]
    )
    cleaned["density"] = _choice("density", "medium", VALID_DENSITIES)

    flag_fields = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for flag in flag_fields:
        raw_value = briefing.get(flag, False)
        if isinstance(raw_value, str):
            # Models sometimes emit booleans as strings.
            cleaned[flag] = raw_value.lower() in ("true", "yes", "1")
        else:
            cleaned[flag] = bool(raw_value)

    for key in ("structure_signals", "noise_signals"):
        raw_value = briefing.get(key, [])
        if isinstance(raw_value, list):
            cleaned[key] = [str(item) for item in raw_value if item]
        elif isinstance(raw_value, str) and raw_value:
            cleaned[key] = [raw_value]
        else:
            cleaned[key] = []

    cleaned["extraction_priority"] = _choice(
        "extraction_priority", "partial", VALID_PRIORITIES
    )
    return cleaned
|
||||
|
||||
|
||||
def estimate_token_reduction(original_text, briefing):
    """Estimate how many API tokens a briefing would save for one document.

    Tokens are approximated as ``len(original_text) / 4`` (minimum 1). A fixed
    200 tokens of "orientation" are counted as saved, plus 5% of the document
    per reported noise signal, capped at 40%. Documents with priority ``skip``
    are reported as a 100% reduction because no API call is made.

    Returns:
        A dict of rounded estimates; the ``skip`` case carries a ``note`` key
        and has no ``noise_tokens_saved`` key.
    """
    orientation_saved = 200
    original_tokens = max(len(original_text) / 4, 1)
    approx_tokens = round(original_tokens)

    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": approx_tokens,
            "orientation_tokens_saved": round(original_tokens + orientation_saved),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call"
        }

    noise_fraction = min(len(briefing.get("noise_signals", [])) * 0.05, 0.40)
    noise_tokens = original_tokens * noise_fraction
    saved_share = (orientation_saved + noise_tokens) / (original_tokens + orientation_saved)
    # Capped at 99% so a non-skipped document never reports a full saving.
    capped_pct = min(saved_share * 100, 99.0)

    return {
        "original_tokens_approx": approx_tokens,
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_tokens),
        "noise_reduction_pct": round(noise_fraction * 100, 1),
        "total_reduction_pct": round(capped_pct, 1)
    }
|
||||
|
||||
|
||||
def format_eta(elapsed_times, completed, total):
    """Return an ``ETA: H:MM:SS`` label from the mean time per finished item.

    Args:
        elapsed_times: per-item durations in seconds recorded so far.
        completed: number of items finished; ``0`` yields a placeholder.
        total: total number of items in the run.
    """
    if completed == 0:
        return "ETA: --:--"
    mean_seconds = sum(elapsed_times) / completed
    seconds_left = int((total - completed) * mean_seconds)
    return f"ETA: {timedelta(seconds=seconds_left)}"
|
||||
|
||||
|
||||
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of ``text``."""
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()
    return digest[:8]
|
||||
|
||||
|
||||
def main():
    """Run the briefing test over a document sample and report the results.

    For each sampled document the local model is asked for a briefing, the
    briefing is sanitized, and a token-reduction estimate is computed. Results
    are written to RESULTS_FILE after every document and again with the final
    summary, then the summary is printed.

    An empty sample no longer raises ZeroDivisionError: rates are reported as
    0 and the approach as not viable. A NULL ``source`` is shown as
    ``unknown`` instead of failing on the slice.
    """
    test_start = time.time()
    print("\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    total = len(docs)
    print(f"Loaded {total} distinct source documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "sample_size": total,
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }

    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # The key is always present in the row, so also cover a NULL value.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)
        eta_str = format_eta(elapsed_times, i, total)
        print(f"[{i+1:02d}/{total}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)

        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            # On failure `raw` carries the error string from run_briefing().
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })

        # Checkpoint after every document so a crash keeps partial results.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / total, 1) if total else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the empty-sample case instead of dividing by zero.
    success_ratio = success_count / total if total else 0.0

    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8
    }
    results["summary"] = summary

    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 75)
    print("RESULTS")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)
|
||||
|
||||
|
||||
# Run the briefing test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,508 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cascade Optimization Test — skip-small + compressed-draft
|
||||
|
||||
Tests whether two optimizations on the entity-drafter cascade meaningfully
|
||||
improve the savings ceiling beyond the prior unoptimized cascade (12.66%).
|
||||
|
||||
Optimizations (lettered independently of the A/B conditions listed below):
  A — Skip-small-docs routing: docs <1000 chars bypass the local pass entirely
  B — Compressed draft format: bare JSON array instead of markdown bullets
|
||||
|
||||
Conditions:
|
||||
A — Baseline: single Claude Haiku call, full extraction (unchanged from prior)
|
||||
B — Optimized cascade: skip-small + compressed draft, otherwise same cascade
|
||||
|
||||
Sample: 30 docs from briefing_test_v2_results.json:
|
||||
- 10 small (<1000 chars) — should show 0% delta if skip-small works
|
||||
- 12 medium (1000-5000 chars) — primary test bucket
|
||||
- 8 large (5000-12000 chars, capped at 12K)
|
||||
|
||||
Mistral context: 12K (raised from 8K in prior run).
|
||||
|
||||
Outputs: ~/aaronai/experiments/cascade_optimization_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load settings from the project .env file before reading the environment.
load_dotenv(Path.home() / "aaronai" / ".env")

V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"  # input: source of the document sample
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_optimization_results.json"
HAIKU_MODEL = "claude-haiku-4-5-20251001"
HAIKU_MAX_TOKENS = 4096
HAIKU_TEMPERATURE = 0.0
OLLAMA_URL = "http://localhost:11434/api/generate"
LOCAL_MODEL = "mistral"
LOCAL_TIMEOUT = 180  # seconds; raised — 12K context can take longer
MAX_DOC_CHARS = 12000  # documents are truncated to this length; raised from 8K
SKIP_SMALL_THRESHOLD = 1000  # documents shorter than this (chars) skip the local pass

# USD per million tokens, used for the cost totals in the summary.
HAIKU_IN_PER_M = 1.0
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
|
||||
|
||||
Return ONLY valid JSON:
|
||||
{
|
||||
"candidates": [string]
|
||||
}
|
||||
|
||||
Just names. No types, no relationships. JSON only.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
# Compressed draft format — bare JSON array, minimal preamble
|
||||
CONDITION_B_API_PROMPT_COMPRESSED = """Extract a knowledge graph from the document below.
|
||||
|
||||
Local model entity candidates (hint, not authoritative — verify against the document, ignore false ones, add missed ones):
|
||||
{local_draft_json}
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Remove a surrounding Markdown code fence from ``text``.

    Strips an opening ```` ``` ```` or ```` ```json ```` at the start and a
    closing ```` ``` ```` at the end, plus surrounding whitespace. Falsy
    input yields an empty string.
    """
    if not text:
        return ""
    without_open = re.sub(r"^```(?:json)?\s*", "", text.strip())
    without_close = re.sub(r"\s*```$", "", without_open)
    return without_close.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble a document from its stored chunks.

    All ``embeddings`` rows for ``source`` are read in ``id`` order and joined
    with blank lines.

    Returns:
        ``(text, full_length)`` where ``text`` is truncated to MAX_DOC_CHARS
        and ``full_length`` is the length before truncation, or ``(None, 0)``
        when the source has no rows.
    """
    cursor = pg_conn.cursor()
    cursor.execute(
        "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
        (source,),
    )
    chunk_rows = cursor.fetchall()
    cursor.close()

    if not chunk_rows:
        return None, 0

    combined = "\n\n".join(row[0] for row in chunk_rows)
    return combined[:MAX_DOC_CHARS], len(combined)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send one user message to the Haiku model and collect usage figures.

    Returns:
        A dict with ``input_tokens``, ``output_tokens``, ``latency_s``
        (seconds, 2 decimals), ``response_text`` (text of the first content
        block, or ``""`` when there is none) and ``stop_reason``.
        Exceptions from the client propagate to the caller.
    """
    started = time.time()
    message = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = message.usage
    first_text = message.content[0].text if message.content else ""
    return {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
        "response_text": first_text,
        "stop_reason": message.stop_reason,
    }
|
||||
|
||||
|
||||
def call_local(document_text):
    """Ask the local Ollama model for entity candidates in ``document_text``.

    Never raises: any failure is returned as data so the caller can record it.

    Returns:
        ``{"response": str, "latency_s": float}`` on success, or
        ``{"error": str, "latency_s": float}`` on any exception.
    """
    started = time.time()
    try:
        request_body = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 12288},
        }
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
    except Exception as exc:
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
    return {
        "response": generated,
        "latency_s": round(time.time() - started, 2),
    }
|
||||
|
||||
|
||||
def parse_graph(raw):
    """Count entities and edges in a model's knowledge-graph response.

    Returns:
        ``(entity_count, edge_count)`` when ``raw`` (after fence stripping)
        is a JSON object whose ``entities`` and ``edges`` are both lists;
        ``(None, None)`` in every other case.
    """
    failure = (None, None)

    payload = strip_json_fences(raw)
    if not payload:
        return failure

    try:
        graph = json.loads(payload)
    except json.JSONDecodeError:
        return failure

    if not isinstance(graph, dict):
        return failure

    entities, edges = graph.get("entities"), graph.get("edges")
    if not (isinstance(entities, list) and isinstance(edges, list)):
        return failure
    return len(entities), len(edges)
|
||||
|
||||
|
||||
def parse_candidates(raw):
    """Extract the entity-candidate names from the local model's response.

    Returns:
        A list of stripped, non-empty strings when ``raw`` (after fence
        stripping) is a JSON object with a list under ``candidates``;
        ``None`` when the response cannot be parsed into that shape.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    # Filter again after stripping: a whitespace-only entry is truthy but
    # would otherwise end up in the draft as an empty name.
    names = (str(c).strip() for c in cands if c)
    return [name for name in names if name]
|
||||
|
||||
|
||||
def stratify(docs):
    """Pick 10 small / 12 medium / 8 large by character length, in file order.

    Buckets use each doc's ``content_length``: small < 1000, medium
    1000-4999, large >= 5000. Within a bucket the input order is kept, and
    the result lists small docs first, then medium, then large.
    """
    buckets = {"small": [], "medium": [], "large": []}
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            buckets["small"].append(doc)
        elif length < 5000:
            buckets["medium"].append(doc)
        else:
            buckets["large"].append(doc)
    return buckets["small"][:10] + buckets["medium"][:12] + buckets["large"][:8]
|
||||
|
||||
|
||||
# Cost delta (percent) measured for the earlier, unoptimized cascade run.
_PRIOR_UNOPTIMIZED_CASCADE_PCT = -12.66


def _condition_a_record(a, a_ents, a_edges):
    """Build the stored record for one Condition A (baseline) call."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _fmt_pct(value):
    """Format a percentage delta for display, tolerating a missing value."""
    return "n/a" if value is None else f"{value:+.1f}%"


def _run_document(client, pg_conn, doc_meta, progress):
    """Run conditions A and B for one sampled document and return its record."""
    source = doc_meta["source"]
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"[{progress}] {source[:55]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    # Bucket by the text actually sent. stratify() bucketed by the stored
    # content_length, so the two classifications can disagree.
    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    skip_small_routed = sent_len < SKIP_SMALL_THRESHOLD
    trunc_marker = "*" if truncated else " "
    route_marker = "[skip-small]" if skip_small_routed else "[cascade] "
    print(f"[{progress}] [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] "
          f"{route_marker} {source[:50]}", flush=True)

    # Condition A — always runs
    try:
        a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
        a_ents, a_edges = parse_graph(a["response_text"])
        print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
              f"ents={a_ents} edges={a_edges} stop={a['stop_reason']} t={a['latency_s']}s",
              flush=True)
    except Exception as e:
        print(f" A FAILED: {e}", flush=True)
        a = {"error": str(e)}
        a_ents = a_edges = None

    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "skip_small_routed": skip_small_routed,
        "condition_a": _condition_a_record(a, a_ents, a_edges),
    }

    # Condition B
    if skip_small_routed:
        # Skip-small: B = A. Same call, no local pass.
        print(" B: routed to baseline (skip-small)", flush=True)
        b, b_ents, b_edges = a, a_ents, a_edges
        local_result = {"skipped": "skip_small_routed"}
        local_candidates = []
        local_raw = ""
    else:
        local_result = call_local(doc_text)
        if "error" in local_result:
            print(f" B local FAILED: {local_result['error']} — recording skip", flush=True)
            record["condition_b"] = {
                "skipped": "local_model_failed",
                "local_error": local_result["error"],
                "local_latency_s": local_result.get("latency_s"),
            }
            return record

        local_raw = local_result["response"]
        local_candidates = parse_candidates(local_raw) or []
        print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
              flush=True)

        if not local_candidates:
            print(" B local: empty draft — skipping API call", flush=True)
            record["condition_b"] = {
                "skipped": "local_draft_empty",
                "local_latency_s": local_result.get("latency_s"),
                "local_raw": local_raw[:1000],
            }
            return record

        # Compressed draft format — bare JSON array.
        # str.replace is used because the prompt contains literal braces
        # that str.format would misread.
        local_draft_json = json.dumps(local_candidates, ensure_ascii=False)
        b_prompt = CONDITION_B_API_PROMPT_COMPRESSED.replace("{local_draft_json}", local_draft_json) + doc_text

        try:
            b = call_haiku(client, b_prompt)
            b_ents, b_edges = parse_graph(b["response_text"])
            print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
                  f"ents={b_ents} edges={b_edges} stop={b['stop_reason']} t={b['latency_s']}s",
                  flush=True)
        except Exception as e:
            print(f" B api FAILED: {e}", flush=True)
            b = {"error": str(e)}
            b_ents = b_edges = None

    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        if a_edges and b_edges is not None:
            edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
        print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)

    record["condition_b"] = {
        "skip_small_routed": skip_small_routed,
        "local_latency_s": local_result.get("latency_s"),
        "local_candidates": local_candidates,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "entity_count": b_ents,
        "edge_count": b_edges,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:4000],
        "error": b.get("error"),
    }
    return record


def _bucket_stats(rows):
    """Aggregate token totals and mean edge counts for one bucket's valid rows."""
    ai = sum(r["condition_a"]["input_tokens"] for r in rows)
    ao = sum(r["condition_a"]["output_tokens"] for r in rows)
    bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
    bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
    ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
    be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
    skip_count = sum(1 for r in rows if r.get("skip_small_routed"))
    return {
        "n": len(rows),
        "n_skip_small_routed": skip_count,
        "n_cascade": len(rows) - skip_count,
        "a_input_tokens": ai,
        "a_output_tokens": ao,
        "b_input_tokens": bi,
        "b_output_tokens": bo,
        "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
        "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
        "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
        "b_avg_edges": round(statistics.mean(be), 1) if be else None,
    }


def main():
    """Run the cascade-optimization experiment and write OUTPUT_FILE.

    Loads the successful documents from V2_FILE, takes a stratified sample,
    runs conditions A and B on each document, then aggregates API token cost
    overall and per size bucket. Exits with status 1 when ANTHROPIC_API_KEY,
    PG_DSN or V2_FILE is missing.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE) as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (10s/12m/8l, file order)")
    print(f"Skip-small threshold: <{SKIP_SMALL_THRESHOLD} chars")
    print(f"Mistral context: 12288 tokens, doc cap {MAX_DOC_CHARS} chars")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            progress = f"{i:02d}/{len(sample)}"
            results.append(_run_document(client, pg_conn, doc_meta, progress))
    finally:
        # Release the connection even if a document raises mid-run.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents with token counts for both conditions enter the totals.
    # Skip-small documents qualify because their B record mirrors A.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        by_bucket[bucket] = _bucket_stats(rows) if rows else None

    summary = {
        "experiment": "cascade_optimization_test",
        "title": "Cascade Optimization — skip-small + compressed-draft",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "skip_small_threshold": SKIP_SMALL_THRESHOLD,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "prior_unoptimized_cascade_pct": _PRIOR_UNOPTIMIZED_CASCADE_PCT,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
        opt_delta = delta_pct - _PRIOR_UNOPTIMIZED_CASCADE_PCT
        print(f"Optimization delta vs prior cascade: {opt_delta:+.2f} points "
              f"(prior was {_PRIOR_UNOPTIMIZED_CASCADE_PCT:.2f}%)")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # The deltas are None when a bucket's baseline token total is 0.
            print(f" {bucket:6s} (n={stats['n']}, skip={stats['n_skip_small_routed']}): "
                  f"in {_fmt_pct(stats['input_delta_pct'])} "
                  f"out {_fmt_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print("Results: " + str(OUTPUT_FILE))
|
||||
|
||||
|
||||
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,485 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cascade Test — Nodes-vs-Edges Experiment
|
||||
|
||||
Tests whether splitting graph extraction into "local drafts entity candidates,
|
||||
API verifies + draws edges" reduces total API cost vs single-shot full
|
||||
extraction, while producing a comparable graph.
|
||||
|
||||
Two conditions per document:
|
||||
A — Baseline: single Claude Haiku call, full extraction
|
||||
B — Cascade: Mistral lists entity candidates, then Haiku does verify+edges
|
||||
|
||||
Both conditions:
|
||||
- See the full document (parity-respecting)
|
||||
- Use open entity type vocabulary (no fixed schema)
|
||||
- Use natural-language predicates (no constrained relations)
|
||||
- Same target output schema, same temperature
|
||||
|
||||
Sample: 20 docs from briefing_test_v2_results.json, stratified by char length.
|
||||
Reports API cost only. Local Mistral time is recorded but not monetized
|
||||
(ran on the VPS, no per-token API charge).
|
||||
|
||||
Outputs: ~/aaronai/experiments/cascade_test_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
V2_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "cascade_test_results.json"
|
||||
HAIKU_MODEL = "claude-haiku-4-5-20251001"
|
||||
HAIKU_MAX_TOKENS = 4096
|
||||
HAIKU_TEMPERATURE = 0.0
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
LOCAL_MODEL = "mistral"
|
||||
LOCAL_TIMEOUT = 120
|
||||
MAX_DOC_CHARS = 8000
|
||||
|
||||
# Verified pricing 2026-04-28 against Anthropic docs
|
||||
HAIKU_IN_PER_M = 1.0
|
||||
HAIKU_OUT_PER_M = 5.0
|
||||
|
||||
|
||||
CONDITION_A_PROMPT = """Extract a knowledge graph from the document below.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits the entity. Do not constrain yourself to a fixed list.
|
||||
|
||||
Edge predicates: natural language phrases that capture the actual relationship the document states or implies.
|
||||
|
||||
Extract every entity and every relationship the document states or strongly implies. Both subject and object in every edge must appear in entities. JSON only, no commentary, no markdown fences.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
LOCAL_PROMPT = """List every named entity that appears in the document below — every person, organization, place, project, document, material, technique, date, event, or other named thing.
|
||||
|
||||
Return ONLY valid JSON:
|
||||
{
|
||||
"candidates": [string]
|
||||
}
|
||||
|
||||
Just names. No types, no relationships. JSON only.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
CONDITION_B_API_PROMPT_WITH_DRAFT = """Extract a knowledge graph from the document below.
|
||||
|
||||
A local model has identified entity candidates that may help orient your reading. Treat the candidates as a hint, not as truth — verify each candidate appears in the document, ignore any that do not, and add any entities the candidates missed.
|
||||
|
||||
Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"entities": [
|
||||
{"name": string, "type": string}
|
||||
],
|
||||
"edges": [
|
||||
{"subject": string, "predicate": string, "object": string}
|
||||
]
|
||||
}
|
||||
|
||||
Entity types: use whatever fits. Edge predicates: natural language phrases capturing the actual relationship. Both subject and object in every edge must appear in entities. Extract every entity and every relationship the document states or strongly implies. JSON only, no commentary, no markdown fences.
|
||||
|
||||
ENTITY CANDIDATES FROM LOCAL MODEL:
|
||||
{local_draft}
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def strip_json_fences(text):
    """Return *text* without a surrounding Markdown code fence.

    A leading ``` or ```json marker and a trailing ``` marker are removed,
    along with the whitespace around them. Falsy input (``None``, ``""``)
    yields an empty string.
    """
    if not text:
        return ""
    without_opening = re.sub(r"^```(?:json)?\s*", "", text.strip())
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reassemble one document from its rows in the ``embeddings`` table.

    All rows whose ``source`` column equals *source* are read in ``id`` order
    and joined with blank lines.

    Args:
        pg_conn: open DB-API connection (psycopg2 in this script).
        source: value of the ``source`` column identifying the document.

    Returns:
        ``(text, original_length)`` where *text* is the joined document cut
        to ``MAX_DOC_CHARS`` characters and *original_length* is the length
        before the cut. ``(None, 0)`` when no row matches.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    if not rows:
        return None, 0
    full = "\n\n".join(r[0] for r in rows)
    return full[:MAX_DOC_CHARS], len(full)
|
||||
|
||||
|
||||
def call_haiku(client, prompt_text):
    """Send *prompt_text* as a single user message and summarize the reply.

    The request uses the module-level ``HAIKU_MODEL``, ``HAIKU_MAX_TOKENS``
    and ``HAIKU_TEMPERATURE`` settings.

    Returns:
        dict with ``input_tokens``, ``output_tokens``, ``latency_s`` (wall
        clock, rounded to 2 decimals), ``response_text`` (text of the first
        content block, or ``""`` when the reply has no content) and
        ``stop_reason``.
    """
    started = time.time()
    reply = client.messages.create(
        model=HAIKU_MODEL,
        max_tokens=HAIKU_MAX_TOKENS,
        temperature=HAIKU_TEMPERATURE,
        messages=[{"role": "user", "content": prompt_text}],
    )
    usage = reply.usage
    summary = {
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens,
        "latency_s": round(time.time() - started, 2),
    }
    if reply.content:
        summary["response_text"] = reply.content[0].text
    else:
        summary["response_text"] = ""
    summary["stop_reason"] = reply.stop_reason
    return summary
|
||||
|
||||
|
||||
def call_local(document_text):
    """Ask the local model for entity candidates in *document_text*.

    Posts ``LOCAL_PROMPT`` followed by the document to ``OLLAMA_URL`` with
    JSON output requested and temperature 0.

    Returns:
        ``{"response": str, "latency_s": float}`` on success, or
        ``{"error": str, "latency_s": float}`` when anything goes wrong.
        Failures are reported this way rather than raised, so a single bad
        document does not abort the run.
    """
    started = time.time()

    def elapsed():
        return round(time.time() - started, 2)

    try:
        request_body = {
            "model": LOCAL_MODEL,
            "prompt": LOCAL_PROMPT + document_text,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 1024, "temperature": 0, "num_ctx": 8192},
        }
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=LOCAL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {"response": generated, "latency_s": elapsed()}
    except Exception as exc:
        return {"error": str(exc), "latency_s": elapsed()}
|
||||
|
||||
|
||||
def parse_graph(raw):
    """Count the entities and edges in a model's JSON graph reply.

    Returns:
        ``(entity_count, edge_count)`` when *raw* (after fence stripping) is
        a JSON object whose ``entities`` and ``edges`` values are both lists;
        ``(None, None)`` for empty, unparseable or differently shaped input.
    """
    unparsed = (None, None)
    body = strip_json_fences(raw)
    if not body:
        return unparsed
    try:
        graph = json.loads(body)
    except json.JSONDecodeError:
        return unparsed
    if not isinstance(graph, dict):
        return unparsed
    entities, edges = graph.get("entities"), graph.get("edges")
    if not (isinstance(entities, list) and isinstance(edges, list)):
        return unparsed
    return len(entities), len(edges)
|
||||
|
||||
|
||||
def parse_candidates(raw):
    """Extract the candidate entity names from the local model's JSON reply.

    Returns:
        A list of non-empty, whitespace-trimmed strings taken from the
        ``candidates`` array, or ``None`` when *raw* is empty, is not valid
        JSON, is not a JSON object, or has no ``candidates`` list.
    """
    cleaned = strip_json_fences(raw)
    if not cleaned:
        return None
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None
    cands = data.get("candidates")
    if not isinstance(cands, list):
        return None
    # Trim first, then filter: a whitespace-only candidate must not survive
    # as "" and make an otherwise empty draft look non-empty to the caller.
    trimmed = (str(c).strip() for c in cands if c)
    return [name for name in trimmed if name]
|
||||
|
||||
|
||||
def stratify(docs):
    """Select up to 5 small, 10 medium and 5 large documents, in file order.

    Buckets are defined on each document's ``content_length``: small is
    below 1000, medium is 1000 to 4999, large is 5000 and above. The result
    lists the small picks first, then medium, then large.
    """
    small, medium, large = [], [], []
    for doc in docs:
        length = doc["content_length"]
        if length < 1000:
            small.append(doc)
        elif length < 5000:
            medium.append(doc)
        else:
            large.append(doc)
    return small[:5] + medium[:10] + large[:5]
|
||||
|
||||
|
||||
def _condition_a_record(a, a_ents, a_edges):
    """Build the ``condition_a`` section of a per-document result record."""
    return {
        "input_tokens": a.get("input_tokens"),
        "output_tokens": a.get("output_tokens"),
        "latency_s": a.get("latency_s"),
        "entity_count": a_ents,
        "edge_count": a_edges,
        "stop_reason": a.get("stop_reason"),
        "response_text": a.get("response_text", "")[:4000],
        "error": a.get("error"),
    }


def _format_pct(value):
    """Format a percentage delta for the console; ``None`` prints as n/a."""
    return "n/a" if value is None else f"{value:+.1f}%"


def _process_document(client, pg_conn, source, progress):
    """Run conditions A and B for one document and return its result record.

    *progress* is the ``[ii/nn]`` prefix used on this document's console
    lines. A failed API call is recorded in the result, not raised.
    """
    doc_text, original_len = fetch_document_text(pg_conn, source)
    if not doc_text:
        print(f"{progress} {source[:60]} — SKIP (not in pgvector)")
        return {"source": source, "skipped": "not_in_pgvector"}

    sent_len = len(doc_text)
    truncated = original_len > sent_len
    size_bucket = (
        "small" if sent_len < 1000
        else "medium" if sent_len < 5000
        else "large"
    )
    trunc_marker = "*" if truncated else " "
    print(f"{progress} [{size_bucket:6s}] [{sent_len:>5}c{trunc_marker}] {source[:55]}", flush=True)

    # Condition A: single-shot full extraction.
    try:
        a = call_haiku(client, CONDITION_A_PROMPT + doc_text)
        a_ents, a_edges = parse_graph(a["response_text"])
        print(f" A: in={a['input_tokens']} out={a['output_tokens']} "
              f"ents={a_ents} edges={a_edges} stop={a['stop_reason']} t={a['latency_s']}s",
              flush=True)
    except Exception as e:
        print(f" A FAILED: {e}", flush=True)
        a = {"error": str(e)}
        a_ents = a_edges = None

    # Every outcome below shares this prefix; only "condition_b" differs.
    record = {
        "source": source,
        "size_bucket": size_bucket,
        "doc_chars_original": original_len,
        "doc_chars_sent": sent_len,
        "truncated": truncated,
        "condition_a": _condition_a_record(a, a_ents, a_edges),
    }

    # Condition B, step 1: local model drafts entity candidates.
    local_result = call_local(doc_text)
    if "error" in local_result:
        print(f" B local FAILED: {local_result['error']} — skipping doc", flush=True)
        record["condition_b"] = {
            "skipped": "local_model_failed",
            "local_error": local_result["error"],
            "local_latency_s": local_result.get("latency_s"),
        }
        return record

    # A JSON null "response" would otherwise break the slicing below.
    local_raw = local_result["response"] or ""
    local_candidates = parse_candidates(local_raw) or []
    print(f" B local: t={local_result['latency_s']}s candidates={len(local_candidates)}",
          flush=True)

    if not local_candidates:
        print(f" B local: empty draft — skipping API call to avoid asymmetric test", flush=True)
        record["condition_b"] = {
            "skipped": "local_draft_empty",
            "local_latency_s": local_result.get("latency_s"),
            "local_raw": local_raw[:1000],
        }
        return record

    # Condition B, step 2: API call with the draft supplied as a hint.
    local_draft_str = "\n".join(f"- {c}" for c in local_candidates)
    b_prompt = CONDITION_B_API_PROMPT_WITH_DRAFT.replace("{local_draft}", local_draft_str) + doc_text

    try:
        b = call_haiku(client, b_prompt)
        b_ents, b_edges = parse_graph(b["response_text"])
        print(f" B api: in={b['input_tokens']} out={b['output_tokens']} "
              f"ents={b_ents} edges={b_edges} stop={b['stop_reason']} t={b['latency_s']}s",
              flush=True)
    except Exception as e:
        print(f" B api FAILED: {e}", flush=True)
        b = {"error": str(e)}
        b_ents = b_edges = None

    if "input_tokens" in a and "input_tokens" in b:
        in_pct = (b["input_tokens"] - a["input_tokens"]) / a["input_tokens"] * 100 if a["input_tokens"] else 0.0
        out_pct = (b["output_tokens"] - a["output_tokens"]) / a["output_tokens"] * 100 if a["output_tokens"] else 0.0
        edge_pct_str = "n/a"
        if a_edges and b_edges is not None:
            edge_pct_str = f"{(b_edges - a_edges) / a_edges * 100:+.1f}%"
        print(f" Δ input={in_pct:+.1f}% output={out_pct:+.1f}% edges={edge_pct_str}", flush=True)

    record["condition_b"] = {
        "local_latency_s": local_result.get("latency_s"),
        "local_candidates": local_candidates,
        "local_raw": local_raw[:1000],
        "api_input_tokens": b.get("input_tokens"),
        "api_output_tokens": b.get("output_tokens"),
        "api_latency_s": b.get("latency_s"),
        "entity_count": b_ents,
        "edge_count": b_edges,
        "stop_reason": b.get("stop_reason"),
        "response_text": b.get("response_text", "")[:4000],
        "error": b.get("error"),
    }
    return record


def main():
    """Run the cascade experiment and write the JSON summary.

    Reads ``ANTHROPIC_API_KEY`` and ``PG_DSN`` from the environment, samples
    documents from ``V2_FILE`` via ``stratify``, runs both conditions on each
    one, then aggregates token counts and API cost (overall and per size
    bucket) into ``OUTPUT_FILE``. Exits with status 1 when a required
    environment variable or the input file is missing.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    pg_dsn = os.environ.get("PG_DSN")
    if not api_key or not pg_dsn:
        print("ERROR: ANTHROPIC_API_KEY or PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    if not V2_FILE.exists():
        print(f"ERROR: {V2_FILE} not found", file=sys.stderr)
        sys.exit(1)

    with open(V2_FILE) as f:
        v2 = json.load(f)

    docs_meta = [d for d in v2["documents"] if d.get("status") == "SUCCESS"]
    sample = stratify(docs_meta)
    print(f"Sample: {len(sample)} docs (stratified by char length, file order)")
    for d in sample:
        print(f" [{d['content_length']:>6}c] {d['source'][:60]}")
    print(f"Haiku model: {HAIKU_MODEL} temp={HAIKU_TEMPERATURE} max_tokens={HAIKU_MAX_TOKENS}")
    print(f"Local model: {LOCAL_MODEL}")
    print()

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    try:
        for i, doc_meta in enumerate(sample, 1):
            progress = f"[{i:02d}/{len(sample)}]"
            results.append(
                _process_document(client, pg_conn, doc_meta["source"], progress)
            )
    finally:
        # Close the connection even when a document fails unexpectedly.
        pg_conn.close()
    total_elapsed = round(time.time() - t_total, 1)

    # Only documents with token counts for both conditions enter the totals.
    valid = [r for r in results
             if r.get("condition_a", {}).get("input_tokens") is not None
             and r.get("condition_b", {}).get("api_input_tokens") is not None]

    a_in = sum(r["condition_a"]["input_tokens"] for r in valid)
    a_out = sum(r["condition_a"]["output_tokens"] for r in valid)
    b_in = sum(r["condition_b"]["api_input_tokens"] for r in valid)
    b_out = sum(r["condition_b"]["api_output_tokens"] for r in valid)
    a_cost = (a_in * HAIKU_IN_PER_M + a_out * HAIKU_OUT_PER_M) / 1_000_000
    b_cost = (b_in * HAIKU_IN_PER_M + b_out * HAIKU_OUT_PER_M) / 1_000_000

    by_bucket = {}
    for bucket in ("small", "medium", "large"):
        rows = [r for r in valid if r["size_bucket"] == bucket]
        if not rows:
            by_bucket[bucket] = None
            continue
        ai = sum(r["condition_a"]["input_tokens"] for r in rows)
        ao = sum(r["condition_a"]["output_tokens"] for r in rows)
        bi = sum(r["condition_b"]["api_input_tokens"] for r in rows)
        bo = sum(r["condition_b"]["api_output_tokens"] for r in rows)
        ae = [r["condition_a"]["edge_count"] for r in rows if r["condition_a"]["edge_count"] is not None]
        be = [r["condition_b"]["edge_count"] for r in rows if r["condition_b"]["edge_count"] is not None]
        by_bucket[bucket] = {
            "n": len(rows),
            "a_input_tokens": ai,
            "a_output_tokens": ao,
            "b_input_tokens": bi,
            "b_output_tokens": bo,
            "input_delta_pct": round((bi - ai) / ai * 100, 2) if ai else None,
            "output_delta_pct": round((bo - ao) / ao * 100, 2) if ao else None,
            "a_avg_edges": round(statistics.mean(ae), 1) if ae else None,
            "b_avg_edges": round(statistics.mean(be), 1) if be else None,
        }

    summary = {
        "experiment": "cascade_test",
        "title": "Nodes-vs-Edges Cascade Experiment",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "haiku_model": HAIKU_MODEL,
        "haiku_temperature": HAIKU_TEMPERATURE,
        "haiku_max_tokens": HAIKU_MAX_TOKENS,
        "local_model": LOCAL_MODEL,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(sample),
        "n_valid_pairs": len(valid),
        "n_skipped": len(sample) - len(valid),
        "total_elapsed_s": total_elapsed,
        "totals": {
            "a_input_tokens": a_in,
            "a_output_tokens": a_out,
            "b_input_tokens": b_in,
            "b_output_tokens": b_out,
            "a_cost_usd": round(a_cost, 4),
            "b_cost_usd": round(b_cost, 4),
            "cost_delta_usd": round(b_cost - a_cost, 4),
            "cost_delta_pct": round((b_cost - a_cost) / a_cost * 100, 2) if a_cost else None,
            "note": "API cost only — local Mistral runtime on VPS not monetized",
        },
        "by_size_bucket": by_bucket,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(sample)} valid pairs in {total_elapsed}s")
    print(f"A total cost: ${a_cost:.4f} (in={a_in} out={a_out})")
    print(f"B total cost: ${b_cost:.4f} (in={b_in} out={b_out})")
    delta_pct = summary['totals']['cost_delta_pct']
    if delta_pct is not None:
        verdict = "B cheaper" if delta_pct < 0 else "B more expensive"
        print(f"Cost delta: {delta_pct:+.2f}% ({verdict})")
    print()
    print("By size bucket:")
    for bucket, stats in by_bucket.items():
        if stats:
            # Deltas are None when a bucket's baseline token sum is zero.
            print(f" {bucket:6s} (n={stats['n']}): "
                  f"in {_format_pct(stats['input_delta_pct'])} "
                  f"out {_format_pct(stats['output_delta_pct'])} "
                  f"edges A={stats['a_avg_edges']} B={stats['b_avg_edges']}")
    print()
    print(f"NOTE: API cost only. Local Mistral runtime is not monetized.")
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BirdAI Cascaded Extraction — Consistency Test
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import hashlib
|
||||
import time
|
||||
from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(os.path.expanduser("~/aaronai/.env"))
|
||||
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
RESULTS_FILE = os.path.expanduser("~/aaronai/consistency_test_results.json")
|
||||
MODEL = "mistral"
|
||||
PASSES = 3
|
||||
SAMPLE_SIZE = 50
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
|
||||
EXTRACTION_PROMPT = """Extract named entities from this text. Return JSON only, no explanation, no prose.
|
||||
Use exactly these fields (omit any field you are uncertain about, use empty list if none found):
|
||||
{
|
||||
"people": [],
|
||||
"organizations": [],
|
||||
"locations": [],
|
||||
"dates": [],
|
||||
"document_type": ""
|
||||
}
|
||||
Rules:
|
||||
- Every value in people, organizations, locations, dates must be a plain string
|
||||
- document_type must be a plain string
|
||||
- No nested objects, no nested lists
|
||||
- Only include entities you are certain about
|
||||
- If uncertain about anything, omit it
|
||||
Text: """
|
||||
|
||||
|
||||
def get_sample_documents():
    """Fetch a random sample of rows from the ``embeddings`` table.

    Selects up to ``SAMPLE_SIZE`` rows whose ``document`` is longer than 100
    and shorter than 3000 characters, in random order.

    Returns:
        list of dict-like rows with ``id``, ``document``, ``source`` and
        ``created_at`` keys.
    """
    conn = psycopg2.connect(PG_DSN)
    try:
        # The cursor's ``with`` block closes the cursor; the connection is
        # closed explicitly in ``finally`` so neither leaks on a query error.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                AND length(document) < 3000
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def run_extraction(text):
    """Run one entity-extraction pass over *text* with the local model.

    Only the first 1500 characters of *text* are sent. The reply is searched
    for its outermost ``{...}`` span, which is parsed as JSON.

    Returns:
        ``(parsed_dict, raw_response)`` on success, or ``(None, reason)``
        where *reason* starts with one of ``NO_JSON``, ``NOT_DICT``,
        ``URL_ERROR``, ``TIMEOUT``, ``JSON_ERROR`` or ``ERROR``.
    """
    # Local import: needed only to recognize pre-3.10 socket timeouts.
    import socket

    timeout_errors = (TimeoutError, socket.timeout)

    prompt = EXTRACTION_PROMPT + text[:1500]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:100]}"
        json_str = raw[start:end]
        parsed = json.loads(json_str)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {json_str[:100]}"
        return parsed, raw
    except timeout_errors:
        return None, "TIMEOUT"
    except urllib.error.URLError as e:
        # A timeout while connecting arrives wrapped in URLError; report it
        # as TIMEOUT so it is classified like a read timeout.
        if isinstance(e.reason, timeout_errors):
            return None, "TIMEOUT"
        return None, f"URL_ERROR: {e}"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
|
||||
|
||||
|
||||
def flatten_value(v):
    """Reduce *v* to a canonical string for comparison across passes.

    Strings are lower-cased and trimmed; dicts become lower-cased JSON with
    sorted keys; lists become JSON of their sorted, flattened items; any
    other value is stringified, lower-cased and trimmed.
    """
    if isinstance(v, str):
        return v.lower().strip()
    if isinstance(v, dict):
        return json.dumps(v, sort_keys=True).lower()
    if isinstance(v, list):
        flattened = [flatten_value(item) for item in v]
        flattened.sort()
        return json.dumps(flattened)
    return str(v).lower().strip()
|
||||
|
||||
|
||||
def normalize_extraction(extracted):
    """Canonicalize one extraction so that passes can be compared with ``==``.

    Only the five expected fields are kept. List values become sorted lists
    of flattened strings; any other value is flattened to a single string.
    Missing list fields default to ``[]`` and a missing ``document_type`` to
    ``""``. Returns ``None`` when *extracted* is ``None``.
    """
    if extracted is None:
        return None
    canonical = {}
    for field in ("people", "organizations", "locations", "dates", "document_type"):
        fallback = "" if field == "document_type" else []
        value = extracted.get(field, fallback)
        if not isinstance(value, list):
            canonical[field] = flatten_value(value)
            continue
        canonical[field] = sorted(flatten_value(item) for item in value)
    return canonical
|
||||
|
||||
|
||||
def extractions_consistent(extractions):
    """Return True when every pass normalizes to the same extraction.

    Any failed pass (``None``) makes the result False.
    """
    for extraction in extractions:
        if extraction is None:
            return False
    canonical = [normalize_extraction(extraction) for extraction in extractions]
    for item in canonical:
        if item is None:
            return False
    if not canonical:
        return True
    reference, *others = canonical
    return all(other == reference for other in others)
|
||||
|
||||
|
||||
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of *text*.

    Used as a short label for a document's content in logs and results.
    """
    digest = hashlib.md5(text.encode())
    return digest.hexdigest()[:8]
|
||||
|
||||
|
||||
def main():
    """Run the consistency test and write results to ``RESULTS_FILE``.

    Each sampled document is extracted ``PASSES`` times. A document is
    TIMEOUT if any pass timed out, FAILED if any pass otherwise failed,
    CONSISTENT if all passes normalize identically, else INCONSISTENT. The
    results file is rewritten after every document, and once more with the
    summary at the end.
    """
    print(f"\nBirdAI Consistency Test")
    print(f"Model: {MODEL} | Passes: {PASSES} | Sample: {SAMPLE_SIZE} docs")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 60)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "passes": PASSES,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None
        },
        "documents": [],
        "summary": {}
    }

    consistent_count = 0
    failed_count = 0
    timeout_count = 0

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # A NULL source comes back as None and would break the slicing below.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)

        print(f"[{i+1:02d}/{len(docs)}] {source[:50]:<50} hash:{chash}", end=" ", flush=True)

        passes = []
        pass_times = []
        raw_outputs = []

        for p in range(PASSES):
            t_start = time.time()
            extracted, raw = run_extraction(content)
            t_end = time.time()
            passes.append(extracted)
            pass_times.append(round(t_end - t_start, 1))
            raw_outputs.append(raw[:200] if raw else "")

        consistent = extractions_consistent(passes)
        # Count a timeout only for a failed pass whose reason is exactly
        # "TIMEOUT"; a substring match would also hit successful replies
        # whose text happens to contain that word.
        any_timeout = any(
            extracted is None and raw == "TIMEOUT"
            for extracted, raw in zip(passes, raw_outputs)
        )
        any_failed = any(p is None for p in passes)

        if any_timeout:
            timeout_count += 1
            status = "TIMEOUT"
        elif any_failed:
            failed_count += 1
            status = "FAILED"
        elif consistent:
            consistent_count += 1
            status = "CONSISTENT"
        else:
            status = "INCONSISTENT"

        print(f"→ {status} ({'/'.join(str(t) for t in pass_times)}s)")

        try:
            sample_extraction = normalize_extraction(passes[0]) if passes[0] else None
        except Exception:
            # Best effort: the sample is informational only.
            sample_extraction = None

        results["documents"].append({
            "id": doc_id,
            "source": source,
            "content_hash": chash,
            "content_length": len(content),
            "status": status,
            "consistent": consistent,
            "pass_times_seconds": pass_times,
            "extraction_sample": sample_extraction,
            "raw_samples": raw_outputs
        })

        # Checkpoint after every document so a crash keeps partial results.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total = len(docs)
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at

    # With no documents, report a 0% rate rather than dividing by zero.
    consistency_ratio = consistent_count / total if total else 0.0

    summary = {
        "total": total,
        "consistent": consistent_count,
        "inconsistent": total - consistent_count - failed_count - timeout_count,
        "failed": failed_count,
        "timeout": timeout_count,
        "consistency_rate": round(consistency_ratio * 100, 1),
        "cascade_viable": consistency_ratio >= 0.5
    }
    results["summary"] = summary

    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 60)
    print(f"RESULTS")
    print(f" Consistent: {consistent_count}/{total} ({summary['consistency_rate']}%)")
    print(f" Inconsistent: {summary['inconsistent']}")
    print(f" Failed/Timeout: {failed_count + timeout_count}")
    print(f" Cascade viable: {'YES' if summary['cascade_viable'] else 'NO — reconsider architecture'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Experiment 003 — Entity-Only Consistency Test
|
||||
|
||||
Three Mistral passes per document, measure consistency on entity fields only
|
||||
(people, organizations, locations, dates). Excludes document_type label.
|
||||
DISTINCT ON (source) sampling — fixes Exp 001 chunk-replacement flaw.
|
||||
|
||||
Outputs: ~/aaronai/experiments/consistency_test_v2_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "consistency_test_v2_results.json"
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
MODEL = "mistral"
|
||||
N_PASSES = 3
|
||||
N_DOCS = 50
|
||||
PER_CALL_TIMEOUT = 60 # seconds — fail fast, don't wedge
|
||||
MAX_DOC_CHARS = 8000 # cap document length sent to Mistral
|
||||
|
||||
EXTRACTION_PROMPT = """Extract entities from the document below. Return ONLY valid JSON with this exact schema:
|
||||
{
|
||||
"people": [string],
|
||||
"organizations": [string],
|
||||
"locations": [string],
|
||||
"dates": [string]
|
||||
}
|
||||
Rules:
|
||||
- Only include entities you are CERTAIN about. If uncertain, omit.
|
||||
- No prose, no markdown fences, no commentary. JSON only.
|
||||
- Empty arrays are valid.
|
||||
|
||||
DOCUMENT:
|
||||
"""
|
||||
|
||||
|
||||
def call_mistral(document_text):
    """Run one extraction pass on the local model.

    At most ``MAX_DOC_CHARS`` characters of *document_text* are sent, with
    JSON output requested and generation capped at 512 tokens.

    Returns:
        ``{"response", "latency_s", "truncated"}`` on success, where
        ``truncated`` says whether the document was cut; otherwise
        ``{"error", "latency_s"}``. On a request timeout ``latency_s`` is
        reported as ``PER_CALL_TIMEOUT``.
    """
    was_cut = len(document_text) > MAX_DOC_CHARS
    body = document_text[:MAX_DOC_CHARS]
    started = time.time()
    try:
        request_body = {
            "model": MODEL,
            "prompt": EXTRACTION_PROMPT + body,
            "stream": False,
            "format": "json",
            "options": {"num_predict": 512},
        }
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=PER_CALL_TIMEOUT)
        reply.raise_for_status()
        generated = reply.json().get("response", "")
        return {
            "response": generated,
            "latency_s": round(time.time() - started, 2),
            "truncated": was_cut,
        }
    except requests.exceptions.Timeout:
        return {"error": f"timeout after {PER_CALL_TIMEOUT}s", "latency_s": PER_CALL_TIMEOUT}
    except Exception as exc:
        return {"error": str(exc), "latency_s": round(time.time() - started, 2)}
|
||||
|
||||
|
||||
def parse_entities(raw_response):
    """Parse a model reply into normalized entity lists.

    Markdown code fences around the JSON are tolerated.

    Returns:
        dict with ``people``, ``organizations``, ``locations`` and ``dates``
        keys, each a sorted list of unique, trimmed, lower-cased strings
        (missing keys give ``[]``). ``None`` when the reply is not valid
        JSON, is not a JSON object, or any of the four fields is not a list.
    """
    text = (raw_response or "").strip()
    text = re.sub(r"^```(?:json)?\s*", "", text)
    text = re.sub(r"\s*```$", "", text)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not an object (e.g. "[]") has no ``.get``.
    if not isinstance(data, dict):
        return None
    out = {}
    for key in ("people", "organizations", "locations", "dates"):
        vals = data.get(key, [])
        if not isinstance(vals, list):
            return None
        out[key] = sorted({str(v).strip().lower() for v in vals if v})
    return out
|
||||
|
||||
|
||||
def entities_match(a, b):
    """Return True when two parsed extractions agree on all four fields.

    A failed parse (``None``) on either side never matches.
    """
    if a is None or b is None:
        return False
    for field in ("people", "organizations", "locations", "dates"):
        if not a[field] == b[field]:
            return False
    return True
|
||||
|
||||
|
||||
def fetch_distinct_sources(pg_conn, n):
    """Load up to *n* whole documents, one per distinct ``source``.

    Each source's rows are concatenated in ``id`` order, separated by blank
    lines. Sources are taken in order of their lowest ``id``.

    Returns:
        list of ``(source, document_text)`` tuples. Documents that are empty
        or have 50 or fewer non-whitespace-trimmed characters are dropped
        after the ``LIMIT``, so fewer than *n* items may be returned.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute("""
            SELECT source, string_agg(document, E'\n\n' ORDER BY id) AS doc
            FROM embeddings
            WHERE source IS NOT NULL
            GROUP BY source
            ORDER BY MIN(id)
            LIMIT %s
        """, (n,))
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()
    return [(s, d) for s, d in rows if d and len(d.strip()) > 50]
|
||||
|
||||
|
||||
def main():
    """Run Experiment 003 and write the JSON summary to ``OUTPUT_FILE``.

    Each document is extracted ``N_PASSES`` times. A document is consistent
    when every pass parsed and all passes agree on all four entity fields;
    per-field agreement is recorded as well. Exits with status 1 when
    ``PG_DSN`` is not set.
    """
    fields = ("people", "organizations", "locations", "dates")
    baseline_pct = 18.0  # Exp 001 result used as the comparison point

    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    pg_conn = psycopg2.connect(pg_dsn)
    try:
        docs = fetch_distinct_sources(pg_conn, N_DOCS)
    finally:
        # Close the connection even when the query fails.
        pg_conn.close()

    print(f"Loaded {len(docs)} distinct sources from pgvector")
    print(f"Model: {MODEL} | Passes per doc: {N_PASSES}")
    print(f"Per-call timeout: {PER_CALL_TIMEOUT}s | Max doc chars: {MAX_DOC_CHARS}")
    print(f"Calls planned: {len(docs) * N_PASSES}\n")

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    for i, (source, doc_text) in enumerate(docs, 1):
        size_marker = f"[{len(doc_text):>5}c]"
        print(f"[{i:02d}/{len(docs)}] {size_marker} {source[:55]}", flush=True)
        passes = []
        for pass_no in range(1, N_PASSES + 1):
            r = call_mistral(doc_text)
            if "error" in r:
                print(f" pass {pass_no}: {r['error']}", flush=True)
                passes.append({"error": r["error"], "parsed_ok": False, "latency_s": r["latency_s"]})
            else:
                entities = parse_entities(r["response"])
                passes.append({
                    "raw": r["response"][:500],
                    "entities": entities,
                    "latency_s": r["latency_s"],
                    "parsed_ok": entities is not None,
                    "truncated_input": r.get("truncated", False),
                })

        all_parsed = bool(passes) and all(p.get("parsed_ok") for p in passes)
        if all_parsed:
            # Compare every pass with the first so the check follows
            # N_PASSES instead of assuming exactly three passes.
            first, *rest = [p["entities"] for p in passes]
            per_field = {
                k: all(other[k] == first[k] for other in rest)
                for k in fields
            }
            consistent = all(per_field.values())
        else:
            consistent = False
            per_field = None

        latencies = [p.get("latency_s", 0) for p in passes]
        print(f" parsed={all_parsed} consistent={consistent} latencies={latencies}", flush=True)

        results.append({
            "source": source,
            "doc_chars": len(doc_text),
            "passes": passes,
            "all_parsed": all_parsed,
            "consistent": consistent,
            "per_field_consistency": per_field,
        })

    total_elapsed = round(time.time() - t_total, 1)

    parsed = [r for r in results if r["all_parsed"]]
    consistent = [r for r in parsed if r["consistent"]]

    field_rates = {k: 0 for k in fields}
    for r in parsed:
        for k, v in (r["per_field_consistency"] or {}).items():
            if v:
                field_rates[k] += 1
    field_rates_pct = {
        k: round(100 * v / len(parsed), 1) if parsed else 0.0
        for k, v in field_rates.items()
    }

    summary = {
        "experiment": "003",
        "title": "Entity-Only Consistency Test",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "n_passes": N_PASSES,
        "per_call_timeout_s": PER_CALL_TIMEOUT,
        "max_doc_chars": MAX_DOC_CHARS,
        "n_documents": len(docs),
        "n_all_parsed": len(parsed),
        "n_fully_consistent": len(consistent),
        "consistency_rate_pct": round(100 * len(consistent) / len(docs), 2) if docs else 0.0,
        "consistency_rate_among_parsed_pct": (
            round(100 * len(consistent) / len(parsed), 2) if parsed else 0.0
        ),
        "per_field_consistency_pct": field_rates_pct,
        "total_elapsed_s": total_elapsed,
        "exp_001_baseline_pct": baseline_pct,
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(docs)} docs in {total_elapsed}s")
    print(f"All {N_PASSES} passes parsed cleanly: {len(parsed)}/{len(docs)}")
    print(f"Fully consistent (all 4 fields match): {len(consistent)}/{len(docs)} ({summary['consistency_rate_pct']}%)")
    print(f"Among parsed only: {summary['consistency_rate_among_parsed_pct']}%")
    print(f"Per-field consistency: {field_rates_pct}")
    print(f"Exp 001 baseline: {baseline_pct:g}% | delta: {summary['consistency_rate_pct'] - baseline_pct:+.2f} pts")
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,179 @@
|
||||
"""
|
||||
Measure actual Graphiti BULK episode cost on a stratified sample.
|
||||
Uses /episodes/bulk endpoint. Submits in small batches to avoid rate limits.
|
||||
"""
|
||||
import json, os, random, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SAMPLE_SIZE = 50
|
||||
BATCH_SIZE = 5
|
||||
RANDOM_SEED = 42
|
||||
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
|
||||
OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_stratified_sample():
    """Draw a length-stratified, seeded random sample of sources from pgvector.

    Each ``source`` in the ``embeddings`` table is reassembled into a single
    document by joining its chunks in ``id`` order. Sources are bucketed by
    document length (short < 1000 chars, medium 1000-4999, long >= 5000) and
    up to 15 / 25 / 10 sources are drawn from the respective buckets after
    seeding the global RNG with ``RANDOM_SEED``.

    Returns:
        list: ``(source, full_document)`` tuples, short bucket first, then
        medium, then long. Sources whose aggregated document is empty are
        excluded.
    """
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
                FROM embeddings
                GROUP BY source
            """)
            sources = [(s, doc) for s, doc in cur.fetchall() if doc]
        finally:
            cur.close()
    finally:
        # Release the connection even when the query itself fails.
        conn.close()

    # GROUP BY gives no row-order guarantee, so fix the pool order here;
    # otherwise the seeded draw below can pick different sources per run.
    sources.sort(key=lambda pair: pair[0] or "")

    random.seed(RANDOM_SEED)
    short = [(s, d) for s, d in sources if len(d) < 1000]
    medium = [(s, d) for s, d in sources if 1000 <= len(d) < 5000]
    long_ = [(s, d) for s, d in sources if len(d) >= 5000]

    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")
    sample = (
        random.sample(short, min(15, len(short))) +
        random.sample(medium, min(25, len(medium))) +
        random.sample(long_, min(10, len(long_)))
    )
    print(f"Sample: {len(sample)} sources, batch_size={BATCH_SIZE}")
    return sample
|
||||
|
||||
|
||||
def submit_bulk_batch(batch):
    """POST one batch of episodes to the Graphiti ``/episodes/bulk`` endpoint.

    Args:
        batch: sequence of ``(source, document)`` pairs. Each document is
            truncated to its first 12000 characters before sending.

    Returns:
        dict: a result record with keys ``batch_size``, ``status_code``,
        ``elapsed_s``, ``elapsed_per_episode_s``, ``response``, ``error`` and
        ``sources``. ``error`` is ``None`` only when the HTTP response was OK;
        any exception raised while sending or reading the response is
        reported in ``error`` rather than propagated.
    """
    episodes = []
    for name, text in batch:
        episodes.append({
            "name": name,
            "content": text[:12000],
            "source_description": "pgvector_migration_bulk_test",
            "timestamp": "2026-04-28T00:00:00",
        })
    body = {"episodes": episodes}

    started = time.time()
    try:
        resp = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=body, timeout=900)
        took = time.time() - started
        outcome = {
            "batch_size": len(batch),
            "status_code": resp.status_code,
            "elapsed_s": round(took, 2),
            "elapsed_per_episode_s": round(took / len(batch), 2),
            "response": resp.json() if resp.ok else None,
            "error": None if resp.ok else resp.text[:500],
            "sources": [name for name, _ in batch],
        }
    except Exception as exc:
        # Failures (timeouts, connection errors, bad JSON) become a record
        # so the caller can keep going with the remaining batches.
        outcome = {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "elapsed_per_episode_s": None,
            "response": None,
            "error": str(exc)[:500],
            "sources": [name for name, _ in batch],
        }
    return outcome
|
||||
|
||||
|
||||
def main():
    """Run the bulk-endpoint cost test and write a JSON summary to ``OUT``.

    Prompts the operator to note current API spend, draws the stratified
    sample, submits it to ``/episodes/bulk`` in batches of ``BATCH_SIZE``,
    then records per-batch results, aggregate timings and (when at least one
    batch succeeded) a runtime extrapolation to every distinct source in the
    ``embeddings`` table.
    """
    print("=" * 60)
    print("Graphiti BULK Migration Cost Test (Haiku 4.5)")
    print("=" * 60)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()

    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return

    batches = [sample[i:i + BATCH_SIZE] for i in range(0, len(sample), BATCH_SIZE)]
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE} episodes")
    print()

    results = []
    total_start = time.time()
    for i, batch in enumerate(batches, start=1):
        avg_chars = int(sum(len(d) for _, d in batch) / len(batch))
        print(f"[batch {i:2d}/{len(batches)}] n={len(batch)} avg_chars={avg_chars:6d}",
              end=" ", flush=True)
        result = submit_bulk_batch(batch)
        results.append(result)
        # Same success criterion as the summary below: an empty error body
        # is still a failure.
        if result["error"] is not None:
            error_text = result["error"]
            print(f" ERROR: {error_text[:80]}")
            # Check the HTTP status as well as the body text: a 429 response
            # need not mention "429" or "rate" in its payload.
            rate_limited = (
                result["status_code"] == 429
                or "429" in error_text
                or "rate" in error_text.lower()
            )
            if rate_limited:
                print(" Rate limited - pausing 30s before next batch")
                time.sleep(30)
        else:
            print(f" {result['status_code']} {result['elapsed_s']}s "
                  f"({result['elapsed_per_episode_s']}s/episode)")
    total_elapsed = time.time() - total_start

    successful_batches = [r for r in results if r["error"] is None]
    failed_batches = [r for r in results if r["error"] is not None]
    successful_episodes = sum(r["batch_size"] for r in successful_batches)
    failed_episodes = sum(r["batch_size"] for r in failed_batches)

    summary = {
        "sample_size": len(sample),
        "batch_size": BATCH_SIZE,
        "n_batches": len(batches),
        "successful_batches": len(successful_batches),
        "failed_batches": len(failed_batches),
        "successful_episodes": successful_episodes,
        "failed_episodes": failed_episodes,
        "total_elapsed_s": round(total_elapsed, 1),
        # Mean wall time per successfully submitted episode; None when
        # nothing succeeded so no extrapolation is attempted.
        "mean_elapsed_per_episode_s": round(
            sum(r["elapsed_s"] for r in successful_batches) /
            max(successful_episodes, 1), 2
        ) if successful_episodes else None,
        "results": results,
    }

    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
            total_sources = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        conn.close()

    summary["total_corpus_sources"] = total_sources
    if summary["mean_elapsed_per_episode_s"]:
        summary["estimated_migration_hours"] = round(
            total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
        )

    OUT.write_text(json.dumps(summary, indent=2))

    print()
    print("=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Episodes: {summary['successful_episodes']}/{summary['sample_size']} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    if summary["mean_elapsed_per_episode_s"]:
        print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
        print(f"Total corpus sources: {summary['total_corpus_sources']}")
        print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print("AFTER:")
    print(" Wait 5 min; note new Anthropic spend; subtract from $28.61 baseline.")
    print(f" delta / {summary['successful_episodes']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,122 @@
|
||||
"""
|
||||
Retest just the previously-failed batches after raising MAX_QUEUED_QUERIES.
|
||||
Reads failed sources from graphiti_bulk_cost_test.json and resubmits.
|
||||
"""
|
||||
import json, os, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
BATCH_SIZE = 5
|
||||
|
||||
PRIOR_RESULTS = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_cost_test.json"
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
|
||||
|
||||
|
||||
def fetch_doc_for_source(cur, source):
    """Return every chunk stored for ``source`` joined into one document.

    Args:
        cur: an open database cursor exposing ``execute`` and ``fetchone``.
        source: value matched against the ``source`` column of ``embeddings``.

    Returns:
        The first column of the fetched row (chunks joined in ``id`` order),
        or ``None`` when no row comes back.
    """
    sql = """
        SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id)
        FROM embeddings WHERE source = %s
    """
    cur.execute(sql, (source,))
    first = cur.fetchone()
    if not first:
        return None
    return first[0]
|
||||
|
||||
|
||||
def submit_bulk_batch(batch):
    """POST one retry batch to the Graphiti ``/episodes/bulk`` endpoint.

    Args:
        batch: sequence of ``(source, document)`` pairs. Each document is
            truncated to its first 12000 characters before sending.

    Returns:
        dict: a result record with keys ``batch_size``, ``status_code``,
        ``elapsed_s``, ``elapsed_per_episode_s``, ``error`` and ``sources``.
        ``error`` is ``None`` only for an OK HTTP response; exceptions are
        captured in ``error`` instead of being raised.
    """
    payload = {"episodes": [
        {"name": s, "content": d[:12000],
         "source_description": "pgvector_migration_bulk_retry",
         "timestamp": "2026-04-28T00:00:00"}
        for s, d in batch
    ]}
    t0 = time.time()
    try:
        r = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json=payload, timeout=900)
        # Measure once so the per-episode figure is derived from the same
        # elapsed value that is reported for the batch.
        elapsed = time.time() - t0
        return {
            "batch_size": len(batch),
            "status_code": r.status_code,
            "elapsed_s": round(elapsed, 2),
            "elapsed_per_episode_s": round(elapsed / len(batch), 2),
            "error": None if r.ok else r.text[:500],
            "sources": [s for s, _ in batch],
        }
    except Exception as e:
        # Report the failure as data so the remaining batches still run.
        return {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - t0, 2),
            "elapsed_per_episode_s": None,
            "error": str(e)[:500],
            "sources": [s for s, _ in batch],
        }
|
||||
|
||||
|
||||
def main():
    """Resubmit the sources from batches that failed in the prior bulk run.

    Reads ``PRIOR_RESULTS``, collects the sources of every batch whose
    ``error`` is not ``None``, reloads their documents from pgvector,
    resubmits them in batches of ``BATCH_SIZE`` and writes a summary to
    ``OUT``.
    """
    previous = json.loads(PRIOR_RESULTS.read_text())
    failed_sources = [
        name
        for record in previous["results"]
        if record["error"] is not None
        for name in record["sources"]
    ]
    print(f"Retrying {len(failed_sources)} previously-failed sources")

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources_with_docs = []
    for name in failed_sources:
        text = fetch_doc_for_source(cur, name)
        if not text:
            print(f" WARN: could not find doc for source {name}")
            continue
        sources_with_docs.append((name, text))
    cur.close()
    conn.close()
    print(f"Loaded {len(sources_with_docs)} source docs")
    print()

    batches = []
    for offset in range(0, len(sources_with_docs), BATCH_SIZE):
        batches.append(sources_with_docs[offset:offset + BATCH_SIZE])

    results = []
    total_start = time.time()
    for number, group in enumerate(batches, start=1):
        avg = int(sum(len(text) for _, text in group) / len(group))
        print(f"[batch {number:2d}/{len(batches)}] n={len(group)} avg_chars={avg:6d}",
              end=" ", flush=True)
        outcome = submit_bulk_batch(group)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start

    # A batch counts as successful only when no error was recorded at all.
    successful = []
    failed = []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)

    summary = {
        "n_retry_sources": len(sources_with_docs),
        "n_batches": len(batches),
        "successful_batches": len(successful),
        "failed_batches": len(failed),
        "successful_episodes": sum(o["batch_size"] for o in successful),
        "failed_episodes": sum(o["batch_size"] for o in failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))

    rule = "=" * 60
    print()
    print(rule)
    print("RETRY RESULTS")
    print(rule)
    print(f"Episodes: {summary['successful_episodes']}/{len(sources_with_docs)} succeeded")
    print(f"Batches: {summary['successful_batches']}/{summary['n_batches']} succeeded")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print()
    print(f"Full results: {OUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,93 @@
|
||||
"""Retry attempt #2 — for sources that timed out after MAX_QUEUED_QUERIES bump."""
|
||||
import json, os, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
BATCH_SIZE = 3 # smaller batches given timeouts
|
||||
|
||||
PRIOR = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry.json"
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_bulk_retry2.json"
|
||||
|
||||
|
||||
def fetch_doc(cur, source):
    """Return the chunks stored for ``source`` joined into one document.

    Runs a ``STRING_AGG`` over ``embeddings`` on the given cursor and returns
    the first column of the fetched row, or ``None`` when no row comes back.
    """
    query = "SELECT STRING_AGG(document, E'\\n\\n' ORDER BY id) FROM embeddings WHERE source = %s"
    cur.execute(query, (source,))
    fetched = cur.fetchone()
    if fetched:
        return fetched[0]
    return None
|
||||
|
||||
|
||||
def submit_batch(batch):
    """POST one batch of episodes to ``/episodes/bulk`` for retry attempt #2.

    Args:
        batch: sequence of ``(source, document)`` pairs; each document is cut
            to its first 12000 characters.

    Returns:
        dict: keys ``batch_size``, ``status_code``, ``elapsed_s``, ``error``
        and ``sources``. Any exception is captured in ``error`` rather than
        raised; ``error`` is ``None`` only for an OK HTTP response.
    """
    episodes = []
    for name, text in batch:
        episodes.append({
            "name": name,
            "content": text[:12000],
            "source_description": "pgvector_migration_bulk_retry2",
            "timestamp": "2026-04-28T00:00:00",
        })

    started = time.time()
    try:
        resp = requests.post(f"{GRAPHITI_URL}/episodes/bulk", json={"episodes": episodes}, timeout=900)
        outcome = {
            "batch_size": len(batch),
            "status_code": resp.status_code,
            "elapsed_s": round(time.time() - started, 2),
            "error": None if resp.ok else resp.text[:500],
            "sources": [name for name, _ in batch],
        }
    except Exception as exc:
        outcome = {
            "batch_size": len(batch),
            "status_code": None,
            "elapsed_s": round(time.time() - started, 2),
            "error": str(exc)[:500],
            "sources": [name for name, _ in batch],
        }
    return outcome
|
||||
|
||||
|
||||
def main():
    """Second retry pass over sources that still failed in the first retry.

    Reads ``PRIOR``, gathers the sources of every batch with a recorded
    error, reloads their documents, resubmits them in batches of
    ``BATCH_SIZE`` and writes a summary to ``OUT``. Sources with no document
    in pgvector are dropped without a message.
    """
    previous = json.loads(PRIOR.read_text())
    failed = [
        name
        for record in previous["results"]
        if record["error"] is not None
        for name in record["sources"]
    ]
    print(f"Retry #2: {len(failed)} sources still failing")

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    sources = []
    for name in failed:
        text = fetch_doc(cur, name)
        if not text:
            continue
        sources.append((name, text))
    cur.close()
    conn.close()

    batches = []
    for offset in range(0, len(sources), BATCH_SIZE):
        batches.append(sources[offset:offset + BATCH_SIZE])
    print(f"Submitting {len(batches)} batches of up to {BATCH_SIZE}\n")

    results = []
    for number, group in enumerate(batches, 1):
        avg = int(sum(len(text) for _, text in group) / len(group))
        print(f"[batch {number}/{len(batches)}] n={len(group)} avg_chars={avg:6d}", end=" ", flush=True)
        outcome = submit_batch(group)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")

    succ = []
    fail = []
    for outcome in results:
        (succ if outcome["error"] is None else fail).append(outcome)

    summary = {
        "n_sources": len(sources),
        "successful_batches": len(succ),
        "failed_batches": len(fail),
        "successful_episodes": sum(o["batch_size"] for o in succ),
        "failed_episodes": sum(o["batch_size"] for o in fail),
        "results": results,
    }
    OUT.write_text(json.dumps(summary, indent=2))
    print()
    print(f"Episodes: {summary['successful_episodes']}/{len(sources)} succeeded")
    print(f"Full results: {OUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Measure actual Graphiti episode-add cost on a stratified sample of pgvector sources.
|
||||
"""
|
||||
import json, os, random, time
|
||||
from pathlib import Path
|
||||
import psycopg2, requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SAMPLE_SIZE = 50
|
||||
RANDOM_SEED = 42
|
||||
|
||||
OUT = Path.home() / "aaronai" / "experiments" / "graphiti_cost_test.json"
|
||||
OUT.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def fetch_stratified_sample():
    """Draw a length-stratified, seeded random sample of sources from pgvector.

    Each ``source`` in the ``embeddings`` table is reassembled into a single
    document by joining its chunks in ``id`` order. Sources are bucketed by
    document length (short < 1000 chars, medium 1000-4999, long >= 5000) and
    up to 15 / 25 / 10 sources are drawn from the respective buckets after
    seeding the global RNG with ``RANDOM_SEED``.

    Returns:
        list: ``(source, full_document)`` tuples, short bucket first, then
        medium, then long. Sources whose aggregated document is empty are
        excluded.
    """
    conn = psycopg2.connect(PG_DSN)
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT source, STRING_AGG(document, E'\\n\\n' ORDER BY id) AS full_doc
                FROM embeddings
                GROUP BY source
            """)
            sources = [(s, doc) for s, doc in cur.fetchall() if doc]
        finally:
            cur.close()
    finally:
        # Release the connection even when the query itself fails.
        conn.close()

    # GROUP BY gives no row-order guarantee, so fix the pool order here;
    # otherwise the seeded draw below can pick different sources per run.
    sources.sort(key=lambda pair: pair[0] or "")

    random.seed(RANDOM_SEED)
    short = [(s, d) for s, d in sources if len(d) < 1000]
    medium = [(s, d) for s, d in sources if 1000 <= len(d) < 5000]
    long_ = [(s, d) for s, d in sources if len(d) >= 5000]

    print(f"Pool: short={len(short)} medium={len(medium)} long={len(long_)}")
    sample = (
        random.sample(short, min(15, len(short))) +
        random.sample(medium, min(25, len(medium))) +
        random.sample(long_, min(10, len(long_)))
    )
    print(f"Sample: {len(sample)} sources")
    return sample
|
||||
|
||||
|
||||
def submit_episode(source: str, document: str) -> dict:
    """POST a single episode to the Graphiti ``/episodes`` endpoint.

    Args:
        source: episode name.
        document: full document text; only the first 12000 characters are sent.

    Returns:
        dict: keys ``source``, ``doc_chars`` (full length), ``doc_chars_sent``
        (capped at 12000), ``status_code``, ``elapsed_s`` and ``error``.
        Exceptions are captured in ``error`` instead of being raised;
        ``error`` is ``None`` only for an OK HTTP response.
    """
    body = {
        "name": source,
        "content": document[:12000],
        "source_description": "pgvector_migration_cost_test",
        "timestamp": "2026-04-28T00:00:00",
    }
    # Fields that are the same whether or not the request succeeds.
    record = {
        "source": source,
        "doc_chars": len(document),
        "doc_chars_sent": min(len(document), 12000),
    }
    started = time.time()
    try:
        resp = requests.post(f"{GRAPHITI_URL}/episodes", json=body, timeout=600)
        record["status_code"] = resp.status_code
        record["elapsed_s"] = round(time.time() - started, 2)
        record["error"] = None if resp.ok else resp.text[:500]
    except Exception as exc:
        record["status_code"] = None
        record["elapsed_s"] = round(time.time() - started, 2)
        record["error"] = str(exc)[:500]
    return record
|
||||
|
||||
|
||||
def main():
    """Run the single-episode cost test and write a JSON summary to ``OUT``.

    Prompts the operator to note current API spend, draws the stratified
    sample, submits the first source as a smoke test (halting on error),
    then submits the rest one at a time. The summary holds overall and
    per-length-bucket timings plus a runtime extrapolation to every distinct
    source in the ``embeddings`` table.
    """
    rule = "=" * 60
    print(rule)
    print("Graphiti Migration Cost Test (Haiku 4.5)")
    print(rule)
    print()
    print("BEFORE running:")
    print(" 1. Open https://console.anthropic.com/settings/usage")
    print(" 2. Note current spend.")
    print()
    input("Press Enter when noted... ")
    print()

    sample = fetch_stratified_sample()
    if not sample:
        print("ERROR: empty sample")
        return

    # Smoke test: one real submission before committing to the full run.
    print(f"Smoke test on first source ({sample[0][0][:50]}...):")
    smoke = submit_episode(*sample[0])
    print(f" status={smoke['status_code']} elapsed={smoke['elapsed_s']}s")
    if smoke["error"]:
        print(f" ERROR: {smoke['error']}")
        OUT.write_text(json.dumps({"smoke_test": smoke}, indent=2))
        print("Halted — fix smoke test before bulk run.")
        return
    print(f" OK. Proceeding with {len(sample)} sources.")
    print()

    # The smoke-test result counts as the first sample result.
    results = [smoke]
    total_start = time.time()
    for position, (name, text) in enumerate(sample[1:], start=2):
        if len(text) < 1000:
            bucket = "short"
        elif len(text) < 5000:
            bucket = "medium"
        else:
            bucket = "long"
        print(f"[{position:2d}/{len(sample)}] [{bucket:6s}] [{len(text):6d}c] {name[:50]:50s}", end=" ", flush=True)
        outcome = submit_episode(name, text)
        results.append(outcome)
        if outcome["error"]:
            print(f" ERROR: {outcome['error'][:80]}")
        else:
            print(f" {outcome['status_code']} {outcome['elapsed_s']}s")
    total_elapsed = time.time() - total_start

    successful = []
    failed = []
    for outcome in results:
        (successful if outcome["error"] is None else failed).append(outcome)

    elapsed_total_ok = sum(o["elapsed_s"] for o in successful)
    summary = {
        "sample_size": len(sample),
        "successful": len(successful),
        "failed": len(failed),
        "total_elapsed_s": round(total_elapsed, 1),
        "mean_elapsed_per_episode_s": round(elapsed_total_ok / max(len(successful), 1), 2),
        "by_bucket": {},
        "results": results,
    }

    bucket_bounds = [("short", 0, 1000), ("medium", 1000, 5000), ("long", 5000, 10**9)]
    for label, low, high in bucket_bounds:
        members = [o for o in successful if low <= o["doc_chars"] < high]
        if not members:
            continue
        count = len(members)
        summary["by_bucket"][label] = {
            "n": count,
            "mean_elapsed_s": round(sum(o["elapsed_s"] for o in members) / count, 2),
            "mean_chars": int(sum(o["doc_chars"] for o in members) / count),
        }

    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(DISTINCT source) FROM embeddings")
    total_sources = cur.fetchone()[0]
    cur.close()
    conn.close()

    summary["total_corpus_sources"] = total_sources
    summary["estimated_migration_hours"] = round(
        total_sources * summary["mean_elapsed_per_episode_s"] / 3600, 1
    )

    OUT.write_text(json.dumps(summary, indent=2))

    print()
    print(rule)
    print("RESULTS")
    print(rule)
    print(f"Sample: {summary['successful']}/{summary['sample_size']} succeeded, {summary['failed']} failed")
    print(f"Total elapsed: {summary['total_elapsed_s']}s")
    print(f"Mean per episode: {summary['mean_elapsed_per_episode_s']}s")
    for label, stats in summary["by_bucket"].items():
        print(f" {label:6s} n={stats['n']:3d} chars~{stats['mean_chars']:6d} elapsed~{stats['mean_elapsed_s']}s")
    print()
    print(f"Total corpus sources: {summary['total_corpus_sources']}")
    print(f"Estimated migration runtime: {summary['estimated_migration_hours']} hours")
    print()
    print("AFTER:")
    print(" Wait 5 min; note new Anthropic spend; subtract.")
    print(f" test_cost / {summary['successful']} = per-episode cost")
    print(f" per-episode * {summary['total_corpus_sources']} = full migration estimate")
    print()
    print(f"Full results: {OUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
E1.4 per-source predicate diversity comparison — fixed version.
|
||||
Looks up episode uuids by name in both production and cascade graphs.
|
||||
"""
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from falkordb import FalkorDB
|
||||
|
||||
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
|
||||
PRODUCTION_GROUP = "aaron"
|
||||
CASCADE_GROUP = "aaron_cascade_e14"
|
||||
|
||||
def get_predicates_for_episode(graph, episode_uuid):
    """Count distinct ``RELATES_TO`` edge names that list the given episode.

    Args:
        graph: graph handle exposing ``query(cypher, params)`` whose result
            has a ``result_set`` attribute.
        episode_uuid: uuid looked up in each edge's ``episodes`` property.

    Returns:
        The first cell of the first result row, or ``0`` when the result set
        is empty.
    """
    cypher = """
        MATCH ()-[r:RELATES_TO]->()
        WHERE $uuid IN r.episodes
        RETURN count(DISTINCT r.name) AS predicate_count
    """
    records = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not records:
        return 0
    return records[0][0]
|
||||
|
||||
def get_edge_count_for_episode(graph, episode_uuid):
    """Count ``RELATES_TO`` edges that list the given episode.

    Args:
        graph: graph handle exposing ``query(cypher, params)`` whose result
            has a ``result_set`` attribute.
        episode_uuid: uuid looked up in each edge's ``episodes`` property.

    Returns:
        The first cell of the first result row, or ``0`` when the result set
        is empty.
    """
    cypher = """
        MATCH ()-[r:RELATES_TO]->()
        WHERE $uuid IN r.episodes
        RETURN count(r) AS edge_count
    """
    records = graph.query(cypher, {"uuid": episode_uuid}).result_set
    if not records:
        return 0
    return records[0][0]
|
||||
|
||||
def find_episode_uuid(graph, source_name):
    """Look up the uuid of an ``Episodic`` node by its ``name``.

    Args:
        graph: graph handle exposing ``query(cypher, params)`` whose result
            has a ``result_set`` attribute.
        source_name: value matched against the node's ``name`` property.

    Returns:
        The uuid from the first matching row (the query is limited to one),
        or ``None`` when nothing matches.
    """
    cypher = """
        MATCH (e:Episodic {name: $name})
        RETURN e.uuid AS uuid
        LIMIT 1
    """
    records = graph.query(cypher, {"name": source_name}).result_set
    if not records:
        return None
    return records[0][0]
|
||||
|
||||
def _pct_change(before, after):
    """Return the percentage change from ``before`` to ``after`` (0 when ``before`` is 0)."""
    return ((after - before) / before * 100) if before else 0


def main():
    """Compare per-source predicate and edge counts, production vs cascade.

    Loads the E1.4 results file, keeps the records that contain a
    ``submit_result``, looks each source up by name in both graphs and
    counts the distinct predicates and edges attached to its episode.
    Prints a per-source table, per-bucket totals and an overall aggregate,
    then saves the per-source comparisons as JSON.
    """
    db = FalkorDB(host='localhost', port=6379)
    prod_graph = db.select_graph(PRODUCTION_GROUP)
    cascade_graph = db.select_graph(CASCADE_GROUP)

    with open(E14_RESULTS) as f:
        e14 = json.load(f)

    sources = [r for r in e14['results'] if 'submit_result' in r]
    print(f"Analyzing {len(sources)} sources...")
    print()

    comparisons = []
    missing_prod = 0
    missing_cascade = 0
    for src in sources:
        name = src['name']
        bucket = src['bucket']

        prod_uuid = find_episode_uuid(prod_graph, name)
        cascade_uuid = find_episode_uuid(cascade_graph, name)

        # A source can only be compared when it exists in both graphs.
        if not prod_uuid:
            missing_prod += 1
            print(f" WARN: missing in production: {name}")
            continue
        if not cascade_uuid:
            missing_cascade += 1
            print(f" WARN: missing in cascade: {name}")
            continue

        prod_preds = get_predicates_for_episode(prod_graph, prod_uuid)
        cascade_preds = get_predicates_for_episode(cascade_graph, cascade_uuid)
        prod_edges = get_edge_count_for_episode(prod_graph, prod_uuid)
        cascade_edges = get_edge_count_for_episode(cascade_graph, cascade_uuid)

        comparisons.append({
            "name": name,
            "bucket": bucket,
            "prod_preds": prod_preds,
            "cascade_preds": cascade_preds,
            "delta_preds": cascade_preds - prod_preds,
            "prod_edges": prod_edges,
            "cascade_edges": cascade_edges,
            "delta_edges": cascade_edges - prod_edges,
        })

    if missing_prod or missing_cascade:
        print()
        print(f"Missing: {missing_prod} prod, {missing_cascade} cascade")
        print()

    if not comparisons:
        print("No comparable sources found. Aborting.")
        return

    # Per-source detail
    print(f"{'Bucket':<10} {'Source':<58} {'Preds A→B':<14} {'Δ':<6} {'Edges A→B':<14} {'Δ'}")
    print("-" * 115)
    for c in sorted(comparisons, key=lambda x: (x['bucket'], x['name'])):
        # Keep long names inside the 58-character column.
        name_short = (c['name'][:55] + '..') if len(c['name']) > 58 else c['name']
        preds_str = f"{c['prod_preds']}→{c['cascade_preds']}"
        edges_str = f"{c['prod_edges']}→{c['cascade_edges']}"
        print(f"{c['bucket']:<10} {name_short:<58} {preds_str:<14} {c['delta_preds']:+d} {edges_str:<14} {c['delta_edges']:+d}")

    # Per-bucket aggregation
    print()
    print("=" * 115)
    print("PER-BUCKET AGGREGATION")
    print("=" * 115)
    by_bucket = defaultdict(list)
    for c in comparisons:
        by_bucket[c['bucket']].append(c)

    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        sum_pp = sum(c['prod_preds'] for c in items)
        sum_cp = sum(c['cascade_preds'] for c in items)
        sum_pe = sum(c['prod_edges'] for c in items)
        sum_ce = sum(c['cascade_edges'] for c in items)
        positive = sum(1 for c in items if c['delta_preds'] > 0)
        negative = sum(1 for c in items if c['delta_preds'] < 0)
        flat = sum(1 for c in items if c['delta_preds'] == 0)
        pct_pred = _pct_change(sum_pp, sum_cp)
        pct_edge = _pct_change(sum_pe, sum_ce)
        print(f"\n{bucket.upper()} (n={n}):")
        print(f" Predicates: {sum_pp} → {sum_cp} ({pct_pred:+.1f}%)")
        print(f" Edges: {sum_pe} → {sum_ce} ({pct_edge:+.1f}%)")
        print(f" Outcomes: {positive} positive, {flat} flat, {negative} negative")

    # Aggregate
    print()
    print("=" * 115)
    print(f"AGGREGATE (n={len(comparisons)})")
    print("=" * 115)
    total_pp = sum(c['prod_preds'] for c in comparisons)
    total_cp = sum(c['cascade_preds'] for c in comparisons)
    total_pe = sum(c['prod_edges'] for c in comparisons)
    total_ce = sum(c['cascade_edges'] for c in comparisons)
    # Guarded like the per-bucket figures: a production total of zero must
    # not abort the report before the results are saved.
    print(f" Predicates: {total_pp} → {total_cp} ({_pct_change(total_pp, total_cp):+.1f}%)")
    print(f" Edges: {total_pe} → {total_ce} ({_pct_change(total_pe, total_ce):+.1f}%)")

    out_path = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
    with open(out_path, "w") as f:
        json.dump(comparisons, f, indent=2)
    print()
    print(f"Saved to {out_path}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1.4 orchestration — cascade re-extraction at n=30, group_id=aaron_cascade_e14."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "e14_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "e14_cascade_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_e14"
|
||||
MAX_DOC_CHARS = 12000
|
||||
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
    """Open and return a new psycopg2 connection using ``PG_DSN``."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def fetch_source_text(source):
    """Return the full text stored in pgvector for ``source``.

    Joins every ``document`` chunk of the source in ``id`` order. A fresh
    connection is opened per call and always closed, including when the
    query raises.

    Args:
        source: value matched against the ``source`` column of ``embeddings``.

    Returns:
        The joined document text, or ``None`` when no row or a NULL
        aggregate comes back.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
                FROM embeddings WHERE source = %s
            """, (source,))
            row = cur.fetchone()
        finally:
            cur.close()
    finally:
        conn.close()
    if row is None or row[0] is None:
        return None
    return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text, max_retries=2):
    """Ask the local ``mistral:latest`` model for document metadata as JSON.

    The document is truncated to ``MAX_DOC_CHARS``, appended to
    ``METADATA_PROMPT`` and posted to ``http://localhost:11434/api/generate``
    with ``format="json"``.

    Args:
        text: full document text.
        max_retries: total number of attempts made when the request times
            out or the connection fails.

    Returns:
        dict: the parsed metadata, with ``char_length`` overwritten by the
        length of the truncated text actually sent. On failure, a dict with
        an ``error`` key (plus ``raw`` holding the first 500 characters of
        the model output when that output could not be used).

    Raises:
        Errors other than read timeouts and connection errors (for example
        an HTTP error status from ``raise_for_status``) are not caught here.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    last_err = None
    for attempt in range(max_retries):
        try:
            response = requests.post(
                "http://localhost:11434/api/generate",
                json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
                timeout=300,
            )
            response.raise_for_status()
            raw = response.json()["response"]
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError) as e:
            last_err = e
            if attempt < max_retries - 1:
                print(f" (retry {attempt+1} after {type(e).__name__})", end=" ", flush=True)
                time.sleep(5)
            continue

        try:
            metadata = json.loads(raw)
        except json.JSONDecodeError:
            return {"error": "JSON parse failed", "raw": raw[:500]}
        # Valid JSON is not necessarily an object; callers treat the result
        # as a dict, so report anything else as an error instead of crashing.
        if not isinstance(metadata, dict):
            return {"error": "JSON was not an object", "raw": raw[:500]}
        # Trust the measured length over whatever the model reported.
        metadata["char_length"] = len(truncated)
        return metadata
    return {"error": f"After {max_retries} retries: {last_err}"}
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
    """Turn a metadata dict into a short orientation paragraph.

    Args:
        metadata: dict that may hold ``one_sentence_summary``,
            ``domain_class`` and ``primary_format``. Missing domain or format
            default to ``"unknown"``; a missing summary defaults to ``""``.

    Returns:
        str | None: the orientation text, or ``None`` when ``metadata``
        contains an ``error`` key.
    """
    if "error" in metadata:
        return None

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")

    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
|
||||
|
||||
|
||||
def submit_episode_singular(name, content, custom_instructions):
    """POST one episode to the sidecar's ``/episodes`` endpoint.

    Args:
        name: episode name.
        content: document text; only the first ``MAX_DOC_CHARS`` are sent.
        custom_instructions: value sent as ``custom_extraction_instructions``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error status.
    """
    body = {}
    body["name"] = name
    body["content"] = content[:MAX_DOC_CHARS]
    body["source_description"] = "e14_replication_run"
    body["timestamp"] = "2026-04-29T00:00:00"
    body["group_id"] = TEST_GROUP_ID
    body["custom_extraction_instructions"] = custom_instructions

    reply = requests.post(f"{SIDECAR_URL}/episodes", json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
|
||||
|
||||
|
||||
def load_state():
    """Load earlier results so an interrupted run can resume.

    Returns:
        tuple: ``(results, completed)`` where ``results`` is the stored
        result list and ``completed`` is the set of names whose record
        contains a ``submit_result``. Both are empty when ``RESULTS_FILE``
        does not exist.
    """
    if not RESULTS_FILE.exists():
        return [], set()

    with open(RESULTS_FILE) as handle:
        stored = json.load(handle)

    previous = stored.get("results", [])
    finished = set()
    for record in stored.get("results", []):
        if "submit_result" in record:
            finished.add(record["name"])
    return previous, finished
|
||||
|
||||
|
||||
def main():
    """Run the cascade replication over every selected episode, resumably.

    For each episode in the sample file: fetch its source text, generate
    metadata, turn that metadata into extraction instructions, and submit the
    episode. Results are written to RESULTS_FILE after every episode so the
    run can be resumed; episodes whose saved record already has a
    "submit_result" are skipped.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]

    results, completed = load_state()

    def store(record):
        # A source that failed on an earlier run is retried here; drop its
        # stale record first so the results file holds one record per source.
        results[:] = [r for r in results if r.get("name") != record["name"]]
        results.append(record)
        with open(RESULTS_FILE, "w") as out:
            json.dump({"results": results}, out, indent=2, default=str)

    if completed:
        # Count what is actually left in this sample rather than assuming
        # every completed name belongs to it.
        remaining = sum(1 for ep in selected if ep["name"] not in completed)
        print(f"Resuming — {len(completed)} sources already completed, {remaining} remaining\n")
    else:
        print(f"E1.4 cascade replication — {len(selected)} episodes to group_id={TEST_GROUP_ID}\n")

    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        if name in completed:
            print(f"[{i}/{len(selected)}] [{bucket}] {name} — SKIP (already completed)")
            continue

        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}
        if ep.get("subtype"):
            record["subtype"] = ep["subtype"]

        print(" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            store(record)
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # format_metadata_as_orientation returns None for errored metadata;
        # the episode is still submitted, just without custom instructions.
        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions
        print(" Submitting via /episodes...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Deliberately broad: one failed submission is recorded and the
            # run moves on to the next episode.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        store(record)
        print()

    print(f"\nDone. Results saved to {RESULTS_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1.4 sample selection — n=30 stratified, excluding E1's 10 sources."""
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
E1_SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
OUTPUT = EXPERIMENTS / "e14_sample.json"
|
||||
|
||||
TARGETS = {"high": 8, "mid": 8, "low": 8, "document": 6}
|
||||
|
||||
def _parse_episode_rows(stdout):
    """Parse alternating name/count lines from redis-cli output into dicts."""
    lines = [line for line in stdout.split("\n") if line.strip()]
    episodes = []
    i = 0
    while i < len(lines):
        current = lines[i]
        if current == "name":
            # Header pair ("name", "entities"): skip both column labels.
            i += 2
            continue
        if current.startswith(("Cached", "Query")):
            # Trailing execution statistics: no more data rows.
            break
        if i + 1 >= len(lines):
            i += 1
            continue
        try:
            count = int(lines[i + 1])
        except ValueError:
            # Next line is not a count; resynchronise one line further on.
            i += 1
            continue
        episodes.append({"name": current, "entities": count})
        i += 2
    return episodes


def query_episode_counts():
    """Return every episode in the "aaron" graph with its distinct entity count.

    Runs the Cypher query through ``docker exec falkordb redis-cli`` and
    parses the textual reply.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in the order the
        query returned them (entity count descending).

    Raises:
        RuntimeError: if the docker/redis-cli command exits non-zero. The
            original ignored the exit status, so a failed query looked like
            an empty graph.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"GRAPH.QUERY failed (exit {result.returncode}): {result.stderr.strip()}"
        )
    return _parse_episode_rows(result.stdout)
|
||||
|
||||
|
||||
def is_document(name):
    """Return True when *name* ends (case-insensitively) with a document extension."""
    document_suffixes = (".pdf", ".docx", ".pptx", ".txt", ".md")
    return name.lower().endswith(document_suffixes)
|
||||
|
||||
|
||||
def doc_subtype(name):
    """Classify a document name as academic, reference, creative or other.

    The name is lower-cased and checked against keyword groups in priority
    order; the first group with any keyword contained in the name wins.
    """
    lowered = name.lower()
    keyword_groups = (
        ("academic", ("syllabus", "ind study", "_is")),
        ("reference", ("annual", "report", "_ar20", "rtpcc", "novo")),
        ("reference", ("cv", "resume", "application", "cover letter")),
        ("creative", ("marquee", "pptx", "presentation")),
    )
    for label, needles in keyword_groups:
        for needle in needles:
            if needle in lowered:
                return label
    return "other"
|
||||
|
||||
|
||||
def main():
    """Select the stratified E1.4 sample and write it to OUTPUT.

    Episodes are fetched with their entity counts, E1's sources are excluded,
    and the rest are bucketed by entity-count quartile (high / mid / low) with
    document-named episodes handled separately and stratified by subtype.
    Each bucket contributes the number of episodes given in TARGETS.

    Raises:
        SystemExit: if the graph query returned no episodes, since quartile
            boundaries cannot be computed from an empty list.
    """
    print("Fetching episode entity counts from Tier 1 graph...")
    episodes = query_episode_counts()
    print(f"Got {len(episodes)} episodes")
    if not episodes:
        # Without this guard the quartile lookup below fails with a bare
        # IndexError that says nothing about the real cause.
        raise SystemExit("No episodes returned from the graph; cannot compute quartile boundaries.")

    # Load E1's sample to exclude
    with open(E1_SAMPLE_FILE) as f:
        e1_sample = json.load(f)
    e1_names = {ep["name"] for ep in e1_sample["selected"]}
    print(f"Excluding {len(e1_names)} sources from E1")

    # Quartile boundaries (counts are sorted descending)
    counts = sorted([e["entities"] for e in episodes], reverse=True)
    n = len(counts)
    top_q = counts[n // 4]
    bottom_q = counts[3 * n // 4]
    print(f"Quartile boundaries: top≥{top_q}, mid={bottom_q+1}-{top_q-1}, low≤{bottom_q}")

    # Filter out E1 and bucket
    available = [e for e in episodes if e["name"] not in e1_names]

    high = [e for e in available if e["entities"] >= top_q and not is_document(e["name"])]
    mid = [e for e in available if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
    low = [e for e in available if e["entities"] <= bottom_q and not is_document(e["name"])]
    docs = [e for e in available if is_document(e["name"]) and e["entities"] >= 5]

    print("\nAvailable after E1 exclusion:")
    print(f" High-density: {len(high)}")
    print(f" Mid-density: {len(mid)}")
    print(f" Low-density: {len(low)}")
    print(f" Documents: {len(docs)}")

    # For high/mid/low: take from middle of bucket (avoids edge cases)
    def pick(bucket, want):
        if len(bucket) < want:
            print(f" WARNING: only {len(bucket)} available, asked for {want}")
            return bucket
        mid_idx = len(bucket) // 2
        start = max(0, mid_idx - want // 2)
        return bucket[start:start + want]

    selected = []
    for label, bucket in (("high", high), ("mid", mid), ("low", low)):
        for ep in pick(bucket, TARGETS[label]):
            ep["bucket"] = label
            selected.append(ep)

    # For documents: stratify by subtype, target 2 academic, 2 creative, 2 reference
    doc_targets = {"academic": 2, "creative": 2, "reference": 2}
    docs_by_subtype = {}
    for ep in docs:
        st = doc_subtype(ep["name"])
        ep["subtype"] = st
        docs_by_subtype.setdefault(st, []).append(ep)
    print(f"\n Doc subtypes available: {[(k, len(v)) for k, v in docs_by_subtype.items()]}")

    # Pick from middle of each subtype bucket
    for subtype, target in doc_targets.items():
        sub_docs = docs_by_subtype.get(subtype, [])
        for ep in pick(sub_docs, target):
            ep["bucket"] = "document"
            selected.append(ep)

    # If we're short on documents (e.g., subtype underrepresented), fill from "other"
    doc_count = sum(1 for s in selected if s.get("bucket") == "document")
    if doc_count < TARGETS["document"]:
        shortage = TARGETS["document"] - doc_count
        # Build the name set once instead of once per candidate document.
        chosen_names = {s["name"] for s in selected}
        leftover = [e for e in docs if e["name"] not in chosen_names]
        for ep in leftover[:shortage]:
            ep["bucket"] = "document"
            ep["subtype"] = ep.get("subtype") or doc_subtype(ep["name"])
            selected.append(ep)

    print(f"\nSelected {len(selected)} episodes for E1.4:")
    for ep in selected:
        sub = f"/{ep.get('subtype')}" if ep.get('bucket') == 'document' else ""
        print(f" [{ep['bucket']}{sub:>10}] {ep['entities']:>3}e {ep['name']}")

    with open(OUTPUT, "w") as f:
        json.dump({
            "metadata": {
                "purpose": "E1.4 cascade re-extraction replication (n=30)",
                "exclusions": "E1's 10 sources",
                "stratification": {**TARGETS, "document_subtypes": doc_targets},
                "quartile_top": top_q,
                "quartile_bottom": bottom_q,
            },
            "selected": selected,
        }, f, indent=2)
    print(f"\nSaved to {OUTPUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,246 @@
|
||||
"""
|
||||
E1.6 analysis — correlate domain-purity ratings with cascade outcomes.
|
||||
Applies pre-registered decision rules from E1.6 protocol.
|
||||
"""
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
RATINGS_PATH = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
|
||||
COMPARISON_PATH = "/home/aaron/aaronai/experiments/e14_per_source_comparison.json"
|
||||
|
||||
|
||||
def spearman(xs, ys):
    """Return the Spearman rank correlation of *xs* and *ys*.

    Tied values share the average of the ranks they span. Returns None when
    there are fewer than two observations or when either ranking has zero
    variance (the coefficient is undefined in that case).
    """
    n = len(xs)
    if n < 2:
        return None

    def rank(values):
        # Indices of `values` in ascending order of value.
        order = sorted(range(len(values)), key=lambda pos: values[pos])
        ranks = [0] * len(values)
        start = 0
        while start < len(values):
            # Extend `stop` over the run of values tied with order[start].
            stop = start
            while stop + 1 < len(values) and values[order[stop + 1]] == values[order[start]]:
                stop += 1
            shared = (start + stop) / 2 + 1
            for pos in order[start:stop + 1]:
                ranks[pos] = shared
            start = stop + 1
        return ranks

    rx = rank(xs)
    ry = rank(ys)
    mean_rx = sum(rx) / n
    mean_ry = sum(ry) / n

    dev_x = [rx[i] - mean_rx for i in range(n)]
    dev_y = [ry[i] - mean_ry for i in range(n)]

    num = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    den_x = sum(dx ** 2 for dx in dev_x) ** 0.5
    den_y = sum(dy ** 2 for dy in dev_y) ** 0.5
    if den_x == 0 or den_y == 0:
        return None
    return num / (den_x * den_y)
|
||||
|
||||
|
||||
def main():
    """Join purity ratings with cascade outcomes and print the E1.6 report.

    Reads the ratings and per-source comparison JSON files, joins them on
    source name, prints the primary (binary purity vs outcome), secondary
    (Spearman correlation) and tertiary (within-bucket) tests plus mean
    deltas per group, and saves the joined rows to a JSON file.
    """
    with open(RATINGS_PATH) as f:
        ratings_data = json.load(f)
    with open(COMPARISON_PATH) as f:
        comparisons = json.load(f)

    ratings_by_name = {r['name']: r for r in ratings_data['ratings']}
    comp_by_name = {c['name']: c for c in comparisons}

    def fmt_rho(rho):
        # spearman() returns None for fewer than two points or zero rank
        # variance; formatting None with ":.3f" would raise TypeError.
        return f"{rho:.3f}" if rho is not None else "n/a"

    def categorize_outcome(delta):
        if delta > 0:
            return 'positive'
        if delta < 0:
            return 'negative'
        return 'flat'

    def rate_str(group):
        if not group:
            return "—"
        pos = sum(1 for j in group if j['delta_preds'] > 0)
        return f"{pos}/{len(group)} positive ({pos/len(group)*100:.0f}%)"

    # Join ratings with cascade outcomes
    joined = []
    for name, rating in ratings_by_name.items():
        if name in comp_by_name:
            comp = comp_by_name[name]
            joined.append({
                'name': name,
                'binary': rating['binary'],
                'score': rating['score'],
                'note': rating.get('note'),
                'bucket': comp['bucket'],
                'delta_preds': comp['delta_preds'],
                'delta_edges': comp['delta_edges'],
                'prod_preds': comp['prod_preds'],
                'cascade_preds': comp['cascade_preds'],
            })

    print("=" * 100)
    print(f"E1.6 ANALYSIS — Domain Purity vs Cascade Outcome (n={len(joined)})")
    print("=" * 100)

    # Per-source detail with rating
    print()
    print(f"{'Bucket':<10} {'Source':<48} {'Domain':<8} {'Score':<6} {'Δpreds':<8} {'Δedges':<8}")
    print("-" * 100)
    for j in sorted(joined, key=lambda x: (x['binary'], -x['score'], x['bucket'], x['name'])):
        name_short = (j['name'][:45] + '..') if len(j['name']) > 48 else j['name']
        print(f"{j['bucket']:<10} {name_short:<48} {j['binary']:<8} {j['score']:<6} {j['delta_preds']:+d} {j['delta_edges']:+d}")

    # PRIMARY TEST: binary purity vs cascade outcome distribution
    print()
    print("=" * 100)
    print("PRIMARY TEST: Binary purity vs cascade outcome distribution")
    print("=" * 100)

    by_binary = defaultdict(lambda: {'positive': 0, 'flat': 0, 'negative': 0, 'total': 0})
    for j in joined:
        outcome = categorize_outcome(j['delta_preds'])
        by_binary[j['binary']][outcome] += 1
        by_binary[j['binary']]['total'] += 1

    print()
    print(f"{'Group':<15} {'n':<5} {'Positive':<12} {'Flat':<10} {'Negative':<12}")
    print("-" * 60)
    for binary in ['single', 'multi']:
        d = by_binary[binary]
        n = d['total']
        if n == 0:
            continue
        pos_pct = d['positive'] / n * 100
        flat_pct = d['flat'] / n * 100
        neg_pct = d['negative'] / n * 100
        print(f"{binary+'-domain':<15} {n:<5} {d['positive']} ({pos_pct:.0f}%) {d['flat']} ({flat_pct:.0f}%) {d['negative']} ({neg_pct:.0f}%)")

    # Compute the gap
    if by_binary['single']['total'] > 0 and by_binary['multi']['total'] > 0:
        single_pos_rate = by_binary['single']['positive'] / by_binary['single']['total'] * 100
        multi_pos_rate = by_binary['multi']['positive'] / by_binary['multi']['total'] * 100
        gap = single_pos_rate - multi_pos_rate
        print()
        print(f"Cascade-positive rate gap (single - multi): {gap:+.1f} percentage points")
        print()
        # Apply pre-registered decision rule
        if gap >= 20:
            verdict = "NARROWNESS HYPOTHESIS SUPPORTED"
            detail = f"Single-domain content is {gap:.0f}pp more likely to gain from cascade than multi-domain."
        elif gap <= -20:
            verdict = "REVERSE OF HYPOTHESIS"
            detail = "Multi-domain content unexpectedly benefits more (counter to prediction)."
        elif abs(gap) < 10:
            verdict = "HYPOTHESIS NOT SUPPORTED"
            detail = "Domain purity does not appear to predict cascade outcome."
        else:
            verdict = "INCONCLUSIVE"
            detail = f"Gap of {gap:+.0f}pp is suggestive but below the pre-registered 20pp threshold."
        print(f" Pre-registered decision rule: {verdict}")
        print(f" {detail}")

    # SECONDARY TEST: Spearman correlation between purity score and predicate delta
    print()
    print("=" * 100)
    print("SECONDARY TEST: Spearman rank correlation (purity score vs predicate delta)")
    print("=" * 100)

    scores = [j['score'] for j in joined]
    deltas_pred = [j['delta_preds'] for j in joined]
    deltas_edge = [j['delta_edges'] for j in joined]

    rho_pred = spearman(scores, deltas_pred)
    rho_edge = spearman(scores, deltas_edge)

    print()
    print(f" Spearman ρ (purity score vs Δpredicates): {fmt_rho(rho_pred)}")
    print(f" Spearman ρ (purity score vs Δedges): {fmt_rho(rho_edge)}")
    print()

    if rho_pred is not None:
        if rho_pred >= 0.4:
            v = "STRONG POSITIVE — narrowness hypothesis supported with monotonic relationship"
        elif rho_pred >= 0.2:
            v = "WEAK POSITIVE — consistent with hypothesis but not strong evidence"
        elif rho_pred <= -0.2:
            v = "NEGATIVE — refutes hypothesis"
        else:
            v = "NO CORRELATION — hypothesis not supported"
        print(f" Predicate delta verdict: {v}")
        print()

    # TERTIARY TEST: within-bucket correlation
    print()
    print("=" * 100)
    print("TERTIARY TEST: Within-bucket correlation")
    print("=" * 100)

    by_bucket = defaultdict(list)
    for j in joined:
        by_bucket[j['bucket']].append(j)

    print()
    print(f"{'Bucket':<12} {'n':<5} {'Single':<10} {'Multi':<10} {'ρ (score vs Δpred)':<22}")
    print("-" * 75)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        n = len(items)
        n_single = sum(1 for j in items if j['binary'] == 'single')
        n_multi = sum(1 for j in items if j['binary'] == 'multi')
        if n >= 3:
            scores_b = [j['score'] for j in items]
            deltas_b = [j['delta_preds'] for j in items]
            rho_b = spearman(scores_b, deltas_b)
            rho_str = f"{rho_b:+.3f}" if rho_b is not None else "n/a (no variance)"
        else:
            rho_str = "n/a (too few)"
        print(f"{bucket:<12} {n:<5} {n_single:<10} {n_multi:<10} {rho_str}")

    # Interaction with bucket: do single/multi outcomes differ within bucket?
    print()
    print("Per-bucket cascade-positive rate by binary purity:")
    print()
    print(f"{'Bucket':<12} {'Single':<25} {'Multi':<25}")
    print("-" * 65)
    for bucket in ['high', 'mid', 'low', 'document']:
        items = by_bucket.get(bucket, [])
        if not items:
            continue
        single_items = [j for j in items if j['binary'] == 'single']
        multi_items = [j for j in items if j['binary'] == 'multi']
        print(f"{bucket:<12} {rate_str(single_items):<25} {rate_str(multi_items):<25}")

    # MEAN DELTA by binary group
    print()
    print("=" * 100)
    print("MEAN PREDICATE DELTA BY GROUP")
    print("=" * 100)
    print()
    for binary in ['single', 'multi']:
        items = [j for j in joined if j['binary'] == binary]
        if not items:
            continue
        n = len(items)
        mean_dp = sum(j['delta_preds'] for j in items) / n
        mean_de = sum(j['delta_edges'] for j in items) / n
        sum_pp = sum(j['prod_preds'] for j in items)
        sum_cp = sum(j['cascade_preds'] for j in items)
        pct_change = (sum_cp - sum_pp) / sum_pp * 100 if sum_pp else 0
        print(f"{binary}-domain (n={n}):")
        print(f" Mean Δpredicates per source: {mean_dp:+.2f}")
        print(f" Mean Δedges per source: {mean_de:+.2f}")
        print(f" Aggregate predicate change: {sum_pp} → {sum_cp} ({pct_change:+.1f}%)")
        print()

    # Save joined data for the experiments log writeup
    out_path = "/home/aaron/aaronai/experiments/e16_joined_analysis.json"
    with open(out_path, "w") as f:
        json.dump(joined, f, indent=2)
    print(f"Joined data saved to {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
E1.6 domain-purity rating interface — with full metadata context.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
|
||||
E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
|
||||
RATINGS_OUT = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"
|
||||
|
||||
INTRO = """
|
||||
================================================================================
|
||||
E1.6 — DOMAIN-PURITY RATING
|
||||
================================================================================
|
||||
|
||||
Two ratings per source:
|
||||
|
||||
1. BINARY — single-domain (s) or multi-domain (m)?
|
||||
|
||||
Mental test: "If Mistral had to pick ONE domain class for this source,
|
||||
would picking just one significantly UNDER-DESCRIBE the content?"
|
||||
|
||||
YES → MULTI-DOMAIN (m) — content lives across two+ frames meaningfully
|
||||
NO → SINGLE-DOMAIN (s) — content fits cleanly within one frame
|
||||
|
||||
2. SCORE (1-5) — how cleanly does it fit?
|
||||
|
||||
5 = unambiguously one domain
|
||||
4 = primarily one domain, slight other element
|
||||
3 = balanced two-domain
|
||||
2 = primarily two-domain with traces of a third
|
||||
1 = three or more domain frames weighted significantly
|
||||
|
||||
Single binary usually = score 4-5
|
||||
Multi binary usually = score 1-3
|
||||
|
||||
You see for each source: name, length, AND the full Mistral metadata block
|
||||
(domain_class, primary_format, structural_signals, content_signals, summary).
|
||||
|
||||
Blind to: bucket assignment, cascade outcome.
|
||||
|
||||
Commands at any prompt: 's', 'm', 'skip', 'quit'
|
||||
================================================================================
|
||||
""".strip()
|
||||
|
||||
|
||||
def load_existing():
    """Return the saved rating state, or a fresh empty state if none exists.

    The state is a dict with "ratings" and "completed_names" lists.
    """
    if not os.path.exists(RATINGS_OUT):
        return {"ratings": [], "completed_names": []}
    with open(RATINGS_OUT) as handle:
        return json.load(handle)
|
||||
|
||||
def save(data):
    """Write *data* to RATINGS_OUT as indented JSON, replacing the file."""
    with open(RATINGS_OUT, "w") as handle:
        handle.write(json.dumps(data, indent=2))
|
||||
|
||||
def render_metadata(metadata):
    """Print the metadata block, standard fields first, then any extras.

    Non-dict input and dicts carrying an "error" key each print a single
    placeholder line. Lists are printed one item per line, strings longer
    than 70 characters go on their own line, and "char_length" is never
    shown among the extra fields.
    """
    if not isinstance(metadata, dict):
        print(" (metadata unavailable)")
        return
    if 'error' in metadata:
        print(f" (metadata error: {metadata['error']})")
        return

    def heading(key):
        return key.replace('_', ' ').title()

    # Render fields in a stable order
    standard = [
        'domain_class',
        'primary_format',
        'structural_signals',
        'content_signals',
        'summary',
    ]
    for key in standard:
        if key not in metadata:
            continue
        value = metadata[key]
        label = heading(key)
        if isinstance(value, list):
            if not value:
                print(f" {label}: (none)")
                continue
            print(f" {label}:")
            for item in value:
                print(f" - {item}")
        elif isinstance(value, str) and len(value) > 70:
            # Long strings go on their own line below the label.
            print(f" {label}:")
            print(f" {value}")
        else:
            print(f" {label}: {value}")

    # Show any other fields not in the standard order
    for key, value in metadata.items():
        if key in standard or key == 'char_length':
            continue
        print(f" {heading(key)}: {value}")
|
||||
|
||||
def render_source(src, idx, total):
    """Print the header, name, length and metadata block for one source.

    *idx* and *total* are shown as the source's position in the rating run.
    """
    banner = "=" * 80
    print("", banner, f" Source {idx}/{total}", banner, sep="\n")
    print(f"Name: {src['name']}")
    print(f"Length: {src['doc_chars']:,} chars")
    print("", "Mistral metadata:", "", sep="\n")
    render_metadata(src.get('metadata', {}))
    print("", "-" * 80, sep="\n")
|
||||
|
||||
def get_rating():
    """Prompt for one source's rating.

    Returns the string 'quit' if the rater quits at either prompt, None if
    the source is skipped, otherwise a dict with "binary" ("single" or
    "multi"), "score" (int 1-5) and "note" (stripped text, or None when
    left empty). Invalid answers are re-prompted.
    """
    choice = None
    while choice not in ('s', 'm', 'skip', 'quit'):
        if choice is not None:
            print(" Please enter 's', 'm', 'skip', or 'quit'")
        choice = input("Single-domain or multi-domain? [s/m/skip/quit]: ").strip().lower()

    if choice == 'quit':
        return 'quit'
    if choice == 'skip':
        return None

    while True:
        reply = input("Purity score (1=many frames, 5=clearly single): ").strip()
        if reply.lower() == 'quit':
            return 'quit'
        try:
            score = int(reply)
        except ValueError:
            print(" Please enter a number 1-5 (or 'quit')")
            continue
        if 1 <= score <= 5:
            break
        print(" Score must be 1-5")

    note = input("Optional note (Enter to skip): ").strip()

    return {
        "binary": "single" if choice == 's' else "multi",
        "score": score,
        "note": note or None,
    }
|
||||
|
||||
def main():
    """Run the interactive rating session over the submitted E1.4 sources.

    Sources are shown in a fixed shuffled order (seed 42), skipping any
    already listed in the saved state. State is saved after every recorded
    rating and again on quit or interrupt.
    """
    with open(E14_RESULTS) as handle:
        e14 = json.load(handle)

    sources = [rec for rec in e14['results'] if 'submit_result' in rec]
    presentation_order = list(sources)
    random.Random(42).shuffle(presentation_order)

    state = load_existing()
    completed = set(state['completed_names'])
    remaining = [src for src in presentation_order if src['name'] not in completed]

    print(INTRO)
    print()
    print(f"Total sources: {len(sources)}")
    print(f"Already rated: {len(completed)}")
    print(f"Remaining: {len(remaining)}")
    print()
    if not remaining:
        print("All sources rated. Run analysis script next.")
        return

    input("Press Enter to begin...")

    total = len(sources)
    position = len(completed)
    try:
        for src in remaining:
            position += 1
            render_source(src, position, total)
            try:
                rating = get_rating()
            except (KeyboardInterrupt, EOFError):
                print("\n\nSaving and exiting...")
                save(state)
                return

            if rating == 'quit':
                print("\nSaving and exiting...")
                save(state)
                return
            if rating is None:
                # Skipped sources are not recorded, so they come up again
                # on the next run.
                print(" Skipped")
                continue

            source_name = src['name']
            rating['name'] = source_name
            state['ratings'].append(rating)
            state['completed_names'].append(source_name)
            save(state)
            print(f" Recorded: {rating['binary']}-domain, score={rating['score']}")

        print()
        print("=" * 80)
        print(f"Done. Rated {len(state['ratings'])} sources.")
        print(f"Saved to {RATINGS_OUT}")
    except (KeyboardInterrupt, EOFError):
        print("\n\nSaving...")
        save(state)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 metrics comparison — A (Tier 1 aaron) vs B (cascade aaron_cascade_test) on the 10 sample sources."""
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
COMPARISON_FILE = EXPERIMENTS / "cascade_reextract_comparison.json"
|
||||
|
||||
def query(group_id, cypher):
    """Run *cypher* against graph *group_id* via redis-cli in the falkordb container.

    Returns the command's captured standard output as text.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", group_id, cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
|
||||
|
||||
def parse_int_result(output):
    """Return the first all-digit line of redis-cli output as an int, or 0 if none."""
    tokens = (chunk.strip() for chunk in output.split("\n"))
    return next((int(tok) for tok in tokens if tok and tok.isdigit()), 0)
|
||||
|
||||
def parse_string_list(output):
    """Extract data rows from single-column redis-cli output.

    Column names are not known in advance, so the header is detected
    heuristically: the first non-blank line shorter than 60 characters that
    contains none of ``{}[]``. Lines before the header are dropped, lines
    after it are returned, and parsing stops at the first line starting with
    "Cached" or "Query internal" (the execution statistics).
    """
    rows = [raw.strip() for raw in output.split("\n")]
    collected = []
    header_seen = False
    for row in rows:
        if not row:
            continue
        if row.startswith("Cached") or row.startswith("Query internal"):
            break
        if header_seen:
            collected.append(row)
        elif len(row) < 60 and not any(ch in row for ch in "{}[]"):
            # Likely the header row: skip it and start collecting after it.
            header_seen = True
    return collected
|
||||
|
||||
def metrics_for_source(group_id, source_name):
    """Get metrics for one source's episode in one group_id.

    Returns a dict with:
        "entities": distinct Entity nodes connected to the episode,
        "edges": relationships touching the episode node,
        "rel_types": distinct relationship types on edges of those entities.
    Each value comes from parse_int_result, so a query that yields no number
    is reported as 0.
    """
    # The name is embedded in a double-quoted Cypher string literal. Escape
    # backslashes first, then double quotes, so a name containing either
    # cannot terminate the literal early. Names without those characters
    # produce exactly the same query text as before.
    safe_name = source_name.replace("\\", "\\\\").replace('"', '\\"')
    episode = f'MATCH (e:Episodic {{name: "{safe_name}"}})'

    # Total entities connected to this episode
    q = f'{episode}-[]-(n:Entity) RETURN count(distinct n) AS entities'
    entities = parse_int_result(query(group_id, q))

    # Total edges from this episode (all relationship types)
    q = f'{episode}-[r]-() RETURN count(r) AS edges'
    edges = parse_int_result(query(group_id, q))

    # Distinct relationship types in edges from entities of this episode
    q = (f'{episode}-[]-(n:Entity)-[r]-() '
         f'RETURN count(distinct type(r)) AS types')
    rel_types = parse_int_result(query(group_id, q))

    return {"entities": entities, "edges": edges, "rel_types": rel_types}
|
||||
|
||||
def main():
    """Compare per-source graph metrics between "aaron" (A) and "aaron_cascade_test" (B).

    Prints a per-source table, overall and per-bucket aggregates, and the
    whole-graph count of distinct relationship types, then writes the results
    to COMPARISON_FILE. Percentage deltas are shown as "n/a" when the A-side
    baseline is zero, and means as 0.0 when there are no sources, instead of
    raising ZeroDivisionError.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]

    print(f"E1 metrics comparison — {len(selected)} sources, A=aaron vs B=aaron_cascade_test\n")
    print(f"{'Source':<60} {'A.ent':>6} {'B.ent':>6} {'A.edg':>6} {'B.edg':>6} {'A.typ':>6} {'B.typ':>6}")
    print("-" * 110)

    results = []
    for ep in selected:
        name = ep["name"]
        bucket = ep["bucket"]
        a = metrics_for_source("aaron", name)
        b = metrics_for_source("aaron_cascade_test", name)
        record = {
            "name": name, "bucket": bucket,
            "a_entities": a["entities"], "b_entities": b["entities"],
            "a_edges": a["edges"], "b_edges": b["edges"],
            "a_rel_types": a["rel_types"], "b_rel_types": b["rel_types"],
        }
        results.append(record)
        # Truncate name for display
        display_name = name if len(name) <= 58 else name[:55] + "..."
        print(f"{display_name:<60} {a['entities']:>6} {b['entities']:>6} {a['edges']:>6} {b['edges']:>6} {a['rel_types']:>6} {b['rel_types']:>6}")

    n = len(results)

    def mean(total):
        # An empty sample has no mean; report 0.0 rather than dividing by zero.
        return total / n if n else 0.0

    def pct(base, other, spec):
        # Relative change of `other` against `base`, e.g. "+12.5%". A zero
        # baseline (missing source or failed query) has no defined change.
        if not base:
            return "n/a"
        return f"{format((other - base) / base * 100, spec)}%"

    # Aggregates
    print("\n" + "=" * 110)
    a_ent_sum = sum(r["a_entities"] for r in results)
    b_ent_sum = sum(r["b_entities"] for r in results)
    a_edge_sum = sum(r["a_edges"] for r in results)
    b_edge_sum = sum(r["b_edges"] for r in results)
    a_types_sum = sum(r["a_rel_types"] for r in results)
    b_types_sum = sum(r["b_rel_types"] for r in results)
    print(f"\nAggregate (n={n}):")
    print(f" Entities: A mean={mean(a_ent_sum):.1f} B mean={mean(b_ent_sum):.1f} delta={pct(a_ent_sum, b_ent_sum, '+.1f')}")
    print(f" Edges: A mean={mean(a_edge_sum):.1f} B mean={mean(b_edge_sum):.1f} delta={pct(a_edge_sum, b_edge_sum, '+.1f')}")
    print(f" Rel types: A mean={mean(a_types_sum):.1f} B mean={mean(b_types_sum):.1f} delta={pct(a_types_sum, b_types_sum, '+.1f')}")

    # Global predicate diversity check (unique types in each group_id)
    print("\nGlobal predicate diversity:")
    a_global = parse_int_result(query("aaron", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    b_global = parse_int_result(query("aaron_cascade_test", "MATCH ()-[r]-() RETURN count(distinct type(r)) AS t"))
    print(f" A (aaron): {a_global} distinct relationship types across whole graph")
    print(f" B (aaron_cascade_test): {b_global} distinct relationship types across whole graph")

    # Per-bucket
    print("\nPer-bucket aggregates:")
    for bucket in ["high", "mid", "low", "document"]:
        bucket_results = [r for r in results if r["bucket"] == bucket]
        if not bucket_results:
            continue
        bn = len(bucket_results)
        a_e = sum(r["a_entities"] for r in bucket_results) / bn
        b_e = sum(r["b_entities"] for r in bucket_results) / bn
        a_ed = sum(r["a_edges"] for r in bucket_results) / bn
        b_ed = sum(r["b_edges"] for r in bucket_results) / bn
        print(f" [{bucket:>8}] n={bn} A.ent={a_e:.1f} B.ent={b_e:.1f} ({pct(a_e, b_e, '+.0f')}) "
              f"A.edg={a_ed:.1f} B.edg={b_ed:.1f} ({pct(a_ed, b_ed, '+.0f')})")

    with open(COMPARISON_FILE, "w") as f:
        json.dump({
            "results": results,
            "aggregate": {
                "a_entities_total": a_ent_sum, "b_entities_total": b_ent_sum,
                "a_edges_total": a_edge_sum, "b_edges_total": b_edge_sum,
                "global_predicate_diversity": {"a": a_global, "b": b_global},
            },
        }, f, indent=2)
    print(f"\nSaved to {COMPARISON_FILE}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 corrected metric — count distinct predicate names on edges originating from each episode."""
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
|
||||
def query(group_id, cypher):
    """Send *cypher* to graph *group_id* through redis-cli inside the falkordb container.

    Returns whatever the command wrote to standard output, as text.
    """
    argv = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY"]
    argv += [group_id, cypher]
    finished = subprocess.run(argv, capture_output=True, text=True)
    return finished.stdout
|
||||
|
||||
def get_episode_uuid(group_id, episode_name):
    """Look up the UUID for a given episode name in a given group.

    Args:
        group_id: Graph to search.
        episode_name: Exact value of the Episodic node's ``name`` property.

    Returns:
        The first output line that looks like a UUID (36 characters with four
        dashes), or None when no such line is present.
    """
    # Escape backslashes before quotes. Escaping only the quotes lets a name
    # containing "\" neutralise the escape of a following quote (or of the
    # closing quote itself) and produces a malformed Cypher string literal.
    safe = episode_name.replace("\\", "\\\\").replace("'", "\\'")
    cypher = f"MATCH (e:Episodic) WHERE e.name = '{safe}' RETURN e.uuid LIMIT 1"
    output = query(group_id, cypher)
    for raw_line in output.split("\n"):
        line = raw_line.strip()
        # UUID format check: skips the column header and the statistics lines.
        if len(line) == 36 and line.count("-") == 4:
            return line
    return None
|
||||
|
||||
def count_predicates_for_episode(group_id, uuid):
    """Count distinct predicate names on edges where this episode UUID appears in r.episodes.

    Returns the first all-digit line of the query output as an int, or 0 when
    the output contains no such line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(distinct r.name) AS p"
    stripped = (piece.strip() for piece in query(group_id, cypher).split("\n"))
    numeric = (int(piece) for piece in stripped if piece and piece.isdigit())
    return next(numeric, 0)
|
||||
|
||||
def count_total_edges_for_episode(group_id, uuid):
    """Count total edges originating from this episode.

    Returns the first all-digit line of the query output as an int, or 0 when
    the output contains no such line.
    """
    cypher = f"MATCH ()-[r:RELATES_TO]->() WHERE '{uuid}' IN r.episodes RETURN count(r) AS n"
    stripped = (piece.strip() for piece in query(group_id, cypher).split("\n"))
    numeric = (int(piece) for piece in stripped if piece and piece.isdigit())
    return next(numeric, 0)
|
||||
|
||||
# Load the episode sample, then compare edge and predicate counts per episode
# between graph A ("aaron") and graph B ("aaron_cascade_test").
with open(SAMPLE_FILE) as f:
    sample = json.load(f)
selected = sample["selected"]

print(f"E1 corrected per-source comparison — predicates per episode by edge origin\n")
print(f"{'Source':<60} {'A.edges':>8} {'A.preds':>8} {'B.edges':>8} {'B.preds':>8}")
print("-" * 100)

a_pred_total = 0
b_pred_total = 0
a_edge_total = 0
b_edge_total = 0
records = []

for ep in selected:
    name = ep["name"]
    a_uuid = get_episode_uuid("aaron", name)
    b_uuid = get_episode_uuid("aaron_cascade_test", name)

    # An episode that cannot be found in a graph counts as zero for that graph.
    a_edges = count_total_edges_for_episode("aaron", a_uuid) if a_uuid else 0
    a_preds = count_predicates_for_episode("aaron", a_uuid) if a_uuid else 0
    b_edges = count_total_edges_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0
    b_preds = count_predicates_for_episode("aaron_cascade_test", b_uuid) if b_uuid else 0

    # Truncate long names so the table columns stay aligned.
    display = name if len(name) <= 58 else name[:55] + "..."
    print(f"{display:<60} {a_edges:>8} {a_preds:>8} {b_edges:>8} {b_preds:>8}")

    records.append({
        "name": name, "bucket": ep["bucket"],
        "a_edges": a_edges, "a_preds": a_preds,
        "b_edges": b_edges, "b_preds": b_preds,
    })
    a_pred_total += a_preds
    b_pred_total += b_preds
    a_edge_total += a_edges
    b_edge_total += b_edges

print("-" * 100)
n = len(selected)
# Guard the means: with an empty sample the divisions below used to raise
# ZeroDivisionError before the comparison file was written.
a_edge_mean = a_edge_total / n if n else 0.0
b_edge_mean = b_edge_total / n if n else 0.0
a_pred_mean = a_pred_total / n if n else 0.0
b_pred_mean = b_pred_total / n if n else 0.0
print(f"\nAggregate (n={n}):")
print(f" Edges: A total={a_edge_total} mean={a_edge_mean:.1f} B total={b_edge_total} mean={b_edge_mean:.1f}")
print(f" Predicates: A total={a_pred_total} mean={a_pred_mean:.1f} B total={b_pred_total} mean={b_pred_mean:.1f}")
if a_pred_total > 0:
    print(f" Predicate delta: B vs A = {(b_pred_total-a_pred_total)/a_pred_total*100:+.1f}%")
if a_edge_total > 0:
    print(f" Edge delta: B vs A = {(b_edge_total-a_edge_total)/a_edge_total*100:+.1f}%")

# Per-bucket
print(f"\nPer-bucket:")
for bucket in ["high", "mid", "low", "document"]:
    bucket_records = [r for r in records if r["bucket"] == bucket]
    if not bucket_records:
        continue
    bn = len(bucket_records)
    a_p = sum(r["a_preds"] for r in bucket_records)
    b_p = sum(r["b_preds"] for r in bucket_records)
    a_e = sum(r["a_edges"] for r in bucket_records)
    b_e = sum(r["b_edges"] for r in bucket_records)
    # Report 0% rather than dividing by zero when A has no predicates.
    delta = ((b_p-a_p)/a_p*100) if a_p > 0 else 0
    print(f" [{bucket:>8}] n={bn} A.preds={a_p:>3} B.preds={b_p:>3} ({delta:+.0f}%) A.edges={a_e:>3} B.edges={b_e:>3}")

with open(EXPERIMENTS / "cascade_reextract_corrected_comparison.json", "w") as f:
    json.dump({"per_source": records,
               "aggregate": {"a_preds": a_pred_total, "b_preds": b_pred_total,
                             "a_edges": a_edge_total, "b_edges": b_edge_total}}, f, indent=2)
print(f"\nSaved to {EXPERIMENTS / 'cascade_reextract_corrected_comparison.json'}")
|
||||
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 orchestration — fetch source text, run Mistral metadata, submit to Graphiti test group_id."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_test"
|
||||
MAX_DOC_CHARS = 12000 # Same cap as Tier 1 for parity
|
||||
|
||||
# Stage 2 metadata prompt — verbatim from stage-2-worker-spec.md
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
    """Open and return a new PostgreSQL connection built from PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def fetch_source_text(source):
    """Reassemble the full document from pgvector chunks, mirroring tier1_migration.py logic.

    Args:
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        The chunks' ``document`` text joined by blank lines in ``id`` order,
        or None when no row (or a NULL aggregate) comes back.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises; the connection used to leak on error.
        conn.close()
    if row is None or row[0] is None:
        return None
    return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text):
    """Call local Mistral via Ollama for base-class metadata.

    Args:
        text: Full document text; only the first MAX_DOC_CHARS characters are sent.

    Returns:
        The parsed metadata dict with ``char_length`` overwritten by the length
        of the truncated text, or ``{"error": ..., "raw": ...}`` when the model
        output is not a JSON object.

    Raises:
        requests.HTTPError: If Ollama answers with an error status.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    if not isinstance(metadata, dict):
        # Valid JSON that is not an object (list, string, number) used to crash
        # on the item assignment below; report it like a parse failure instead.
        return {"error": "JSON was not an object", "raw": raw[:500]}
    # Override char_length with python-computed value (per stage-2-worker-spec)
    metadata["char_length"] = len(truncated)
    return metadata
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
    """Format the base-class metadata as a source_description for Graphiti, with orient-not-bound framing.

    Args:
        metadata: Dict from the metadata step. An ``"error"`` key selects the
            failure label; otherwise ``domain_class``, ``primary_format`` and
            ``one_sentence_summary`` are read, with defaults when absent.

    Returns:
        A single-line description string.
    """
    if "error" in metadata:
        return f"tier1_cascade_test (metadata generation failed: {metadata['error']})"

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
|
||||
|
||||
|
||||
def submit_episode(name, content, source_description):
    """Submit episode to Graphiti sidecar at the test group_id.

    Posts one episode (content capped at MAX_DOC_CHARS) to the sidecar's
    ``/episodes/bulk`` endpoint and returns the decoded JSON response.

    Raises:
        requests.HTTPError: If the sidecar answers with an error status.
    """
    episode = {
        "name": name,
        "content": content[:MAX_DOC_CHARS],
        "source_description": source_description,
        "timestamp": "2026-04-28T00:00:00",
    }
    body = {"episodes": [episode], "group_id": TEST_GROUP_ID}
    reply = requests.post(f"{SIDECAR_URL}/episodes/bulk", json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
|
||||
|
||||
|
||||
def main():
    """Run the E1 cascade re-extraction over every episode in SAMPLE_FILE.

    For each selected episode: rebuild its text from pgvector, generate
    metadata with Mistral, submit the text to the Graphiti sidecar with the
    metadata rendered as source_description, and checkpoint all records
    gathered so far to RESULTS_FILE.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 cascade re-extraction starting — {len(selected)} episodes to test group_id={TEST_GROUP_ID}\n")

    results = []

    def save_results():
        # Rewrite the whole file each time so an interrupted run still leaves
        # every record collected so far on disk.
        with open(RESULTS_FILE, "w") as out:
            json.dump({"results": results}, out, indent=2, default=str)

    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        # Fetch text
        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Checkpoint on this path too: it used to skip the save, so a
            # missing-source episode at the end of the run was never recorded.
            save_results()
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        # Mistral metadata
        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        # Submit to Graphiti
        source_desc = format_metadata_as_orientation(metadata)
        record["source_description"] = source_desc
        print(f" Submitting to Graphiti test group...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode(name, text, source_desc)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Record the failure and keep going with the remaining episodes.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        results.append(record)
        # Save intermediate state after each episode
        save_results()
        print()

    # Final write so the message below holds even for an empty sample.
    save_results()
    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 corrected re-run — cascade orientation passed via custom_extraction_instructions."""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import time
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
SAMPLE_FILE = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
RESULTS_FILE = EXPERIMENTS / "cascade_reextract_results.json"
|
||||
PG_DSN = os.environ["PG_DSN"]
|
||||
SIDECAR_URL = "http://localhost:8001"
|
||||
TEST_GROUP_ID = "aaron_cascade_test"
|
||||
MAX_DOC_CHARS = 12000
|
||||
|
||||
METADATA_PROMPT = """You are a metadata extraction system. Given a document, produce structural and content metadata in strict JSON format.
|
||||
|
||||
Do not summarize the content beyond the one-sentence summary field. Do not extract entities or relationships. Do not interpret meaning. Produce only the metadata schema below.
|
||||
|
||||
Output JSON only. No prose, no explanation, no markdown code fences.
|
||||
|
||||
Schema:
|
||||
{
|
||||
"language": "<ISO 639-1 code>",
|
||||
"char_length": <integer>,
|
||||
"primary_format": "<prose|slides|code|structured|mixed>",
|
||||
"structural_signals": {
|
||||
"has_headings": <boolean>,
|
||||
"has_bullet_lists": <boolean>,
|
||||
"has_numbered_lists": <boolean>,
|
||||
"has_tables": <boolean>,
|
||||
"has_code_blocks": <boolean>,
|
||||
"has_dates": <boolean>
|
||||
},
|
||||
"content_signals": {
|
||||
"has_named_people": <boolean>,
|
||||
"has_institutional_language": <boolean>,
|
||||
"has_technical_terminology": <boolean>,
|
||||
"has_first_person": <boolean>,
|
||||
"has_quotations": <boolean>
|
||||
},
|
||||
"domain_class": "<technical|administrative|educational|personal|conversational>",
|
||||
"one_sentence_summary": "<one sentence describing what the document is about>"
|
||||
}
|
||||
|
||||
Document:
|
||||
"""
|
||||
|
||||
|
||||
def get_pg():
    """Open and return a new PostgreSQL connection built from PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
|
||||
def fetch_source_text(source):
    """Reassemble one document from its pgvector chunks.

    Args:
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        The chunks' ``document`` text joined by blank lines in ``id`` order,
        or None when no row (or a NULL aggregate) comes back.
    """
    conn = get_pg()
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT STRING_AGG(document, E'\n\n' ORDER BY id) AS full_doc
            FROM embeddings WHERE source = %s
        """, (source,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises; the connection used to leak on error.
        conn.close()
    if row is None or row[0] is None:
        return None
    return row[0]
|
||||
|
||||
|
||||
def run_mistral_metadata(text):
    """Ask the local Mistral model (via Ollama) for document metadata.

    Args:
        text: Full document text; only the first MAX_DOC_CHARS characters are sent.

    Returns:
        The parsed metadata dict with ``char_length`` overwritten by the length
        of the truncated text, or ``{"error": ..., "raw": ...}`` when the model
        output is not a JSON object.

    Raises:
        requests.HTTPError: If Ollama answers with an error status.
    """
    truncated = text[:MAX_DOC_CHARS]
    prompt = METADATA_PROMPT + truncated
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "mistral:latest", "prompt": prompt, "stream": False, "format": "json"},
        timeout=180,
    )
    response.raise_for_status()
    raw = response.json()["response"]
    try:
        metadata = json.loads(raw)
    except json.JSONDecodeError:
        return {"error": "JSON parse failed", "raw": raw[:500]}
    if not isinstance(metadata, dict):
        # Valid JSON that is not an object (list, string, number) used to crash
        # on the item assignment below; report it like a parse failure instead.
        return {"error": "JSON was not an object", "raw": raw[:500]}
    # The model's own char_length is replaced by the measured value.
    metadata["char_length"] = len(truncated)
    return metadata
|
||||
|
||||
|
||||
def format_metadata_as_orientation(metadata):
    """Format metadata as orient-not-bound extraction instructions.

    Args:
        metadata: Dict from the metadata step.

    Returns:
        None when ``metadata`` carries an ``"error"`` key; otherwise a
        single-line instruction string built from ``domain_class``,
        ``primary_format`` and ``one_sentence_summary`` (defaults when absent).
    """
    if "error" in metadata:
        return None

    domain = metadata.get("domain_class", "unknown")
    fmt = metadata.get("primary_format", "unknown")
    summary = metadata.get("one_sentence_summary", "")
    sentences = [
        f"This is a {domain} document in {fmt} format.",
        f"Summary: {summary}",
        "This metadata is provided to orient your extraction, not to constrain it.",
        "Extract entities and relationships freely from the document text itself;",
        "the metadata is descriptive context, not a checklist.",
    ]
    return " ".join(sentences)
|
||||
|
||||
|
||||
def submit_episode_singular(name, content, custom_instructions):
    """Submit episode to Graphiti's singular /episodes endpoint with cascade orientation.

    The content is capped at MAX_DOC_CHARS and ``custom_instructions`` is sent
    as ``custom_extraction_instructions``. Returns the decoded JSON response.

    Raises:
        requests.HTTPError: If the sidecar answers with an error status.
    """
    body = {}
    body["name"] = name
    body["content"] = content[:MAX_DOC_CHARS]
    # neutral label, not the cascade text
    body["source_description"] = "e1_corrected_run"
    body["timestamp"] = "2026-04-28T00:00:00"
    body["group_id"] = TEST_GROUP_ID
    body["custom_extraction_instructions"] = custom_instructions
    reply = requests.post(f"{SIDECAR_URL}/episodes", json=body, timeout=300)
    reply.raise_for_status()
    return reply.json()
|
||||
|
||||
|
||||
def main():
    """Re-run E1 over every episode in SAMPLE_FILE via the singular endpoint.

    For each selected episode: rebuild its text from pgvector, generate
    metadata with Mistral, submit the text to the sidecar's /episodes endpoint
    with the metadata passed as custom_extraction_instructions, and checkpoint
    all records gathered so far to RESULTS_FILE.
    """
    with open(SAMPLE_FILE) as f:
        sample = json.load(f)
    selected = sample["selected"]
    print(f"E1 CORRECTED re-run — {len(selected)} episodes via /episodes (singular)")
    print(f"Cascade orientation passed in custom_extraction_instructions.\n")

    results = []

    def save_results():
        # Rewrite the whole file each time so an interrupted run still leaves
        # every record collected so far on disk.
        with open(RESULTS_FILE, "w") as out:
            json.dump({"results": results}, out, indent=2, default=str)

    for i, ep in enumerate(selected, 1):
        name = ep["name"]
        bucket = ep["bucket"]
        print(f"[{i}/{len(selected)}] [{bucket}] {name}")
        record = {"name": name, "bucket": bucket, "tier1_entities": ep["entities"]}

        print(f" Fetching source text...", end=" ", flush=True)
        text = fetch_source_text(name)
        if text is None:
            print("FAILED — no chunks in pgvector")
            record["error"] = "no source text"
            results.append(record)
            # Checkpoint on this path too: it used to skip the save, so a
            # missing-source episode at the end of the run was never recorded.
            save_results()
            continue
        record["doc_chars"] = len(text)
        print(f"{len(text)} chars")

        print(f" Generating Mistral metadata...", end=" ", flush=True)
        t0 = time.time()
        metadata = run_mistral_metadata(text)
        elapsed = time.time() - t0
        record["metadata"] = metadata
        record["metadata_elapsed_s"] = round(elapsed, 1)
        if "error" in metadata:
            print(f"FAILED in {elapsed:.1f}s")
        else:
            print(f"{elapsed:.1f}s — domain={metadata.get('domain_class')}, format={metadata.get('primary_format')}")

        custom_instructions = format_metadata_as_orientation(metadata)
        record["custom_extraction_instructions"] = custom_instructions
        print(f" Submitting via /episodes (singular) with custom_extraction_instructions...", end=" ", flush=True)
        t0 = time.time()
        try:
            result = submit_episode_singular(name, text, custom_instructions)
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — OK")
            record["submit_elapsed_s"] = round(elapsed, 1)
            record["submit_result"] = result
        except Exception as e:
            # Record the failure and keep going with the remaining episodes.
            elapsed = time.time() - t0
            print(f"{elapsed:.1f}s — FAILED: {e}")
            record["submit_error"] = str(e)

        results.append(record)
        save_results()
        print()

    # Final write so the message below holds even for an empty sample.
    save_results()
    print(f"\nDone. Results saved to {RESULTS_FILE}")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E1 sample selection — pick 10 episodes from Tier 1 stratified by density and type."""
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
EXPERIMENTS = Path.home() / "aaronai" / "experiments"
|
||||
OUTPUT = EXPERIMENTS / "cascade_reextract_sample.json"
|
||||
|
||||
# Get all Tier 1 episodes with their entity counts via FalkorDB
|
||||
def query_episode_counts():
    """Return every episode in the "aaron" graph with its distinct-entity count.

    Runs the Cypher query through redis-cli inside the falkordb container and
    parses the plain-text output, where each row is a name line followed by a
    count line.

    Returns:
        A list of ``{"name": str, "entities": int}`` dicts in output order.
    """
    query = ("MATCH (e:Episodic) OPTIONAL MATCH (e)-[r]-(n:Entity) "
             "RETURN e.name AS name, count(distinct n) AS entities "
             "ORDER BY entities DESC")
    result = subprocess.run(
        ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query],
        capture_output=True, text=True
    )
    # Parse the output — redis-cli returns rows after a header
    lines = [l for l in result.stdout.split("\n") if l.strip()]
    # Statistics lines that follow the rows. Matching the full prefix, not just
    # "Cached"/"Query", stops an episode whose name begins with one of those
    # words from ending the parse early and dropping every later episode.
    stats_prefixes = ("Cached execution:", "Query internal execution time:")
    episodes = []
    i = 0
    while i < len(lines):
        if lines[i] == "name":
            i += 2  # skip "name" and "entities" headers
            continue
        if lines[i].startswith(stats_prefixes):
            break
        # Each episode: name on one line, count on next
        if i + 1 < len(lines):
            try:
                count = int(lines[i + 1])
                episodes.append({"name": lines[i], "entities": count})
                i += 2
            except ValueError:
                # Not a name/count pair; move on one line and try again.
                i += 1
        else:
            i += 1
    return episodes
|
||||
|
||||
# Pull every episode's entity count from FalkorDB; the density buckets computed
# further down are derived from this list.
print("Fetching episode entity counts from FalkorDB...")
episodes = query_episode_counts()
print(f"Got {len(episodes)} episodes")
|
||||
|
||||
# Classify by density bucket and type
|
||||
def is_document(name):
    """Return True when the episode name ends in a known document extension.

    The comparison is case-insensitive and covers .pdf, .docx, .pptx, .txt
    and .md.
    """
    lowered = name.lower()
    # str.endswith accepts a tuple of suffixes.
    return lowered.endswith((".pdf", ".docx", ".pptx", ".txt", ".md"))
|
||||
|
||||
# Compute quartile boundaries from the entity counts
|
||||
if not episodes:
    # With no parsed episodes, counts[n // 4] below would fail with a bare
    # IndexError; stop with a message that names the actual problem.
    raise SystemExit("No episodes returned from FalkorDB; cannot compute quartile boundaries")

counts = sorted([e["entities"] for e in episodes], reverse=True)
n = len(counts)
top_q = counts[n // 4]  # 25th percentile from top
bottom_q = counts[3 * n // 4]  # 75th percentile from top

print(f"\nQuartile boundaries: top={top_q}+, middle=({bottom_q+1}-{top_q-1}), bottom=0-{bottom_q}")

# Conversations are split into three density buckets; documents form their own
# bucket and need at least 5 entities to qualify.
high = [e for e in episodes if e["entities"] >= top_q and not is_document(e["name"])]
mid = [e for e in episodes if bottom_q < e["entities"] < top_q and not is_document(e["name"])]
low = [e for e in episodes if e["entities"] <= bottom_q and not is_document(e["name"])]
docs = [e for e in episodes if is_document(e["name"]) and e["entities"] >= 5]

print(f"High-density conversations: {len(high)}")
print(f"Mid-density conversations: {len(mid)}")
print(f"Low-density conversations: {len(low)}")
print(f"Documents (≥5 entities): {len(docs)}")
|
||||
|
||||
# Deterministic selection — take from middle of each bucket to avoid edge cases
|
||||
def pick(bucket, n):
    """Take ``n`` consecutive items centred on the middle of ``bucket``.

    When the bucket holds fewer than ``n`` items it is returned unchanged
    (the same list object, not a copy).
    """
    size = len(bucket)
    if size >= n:
        # Start half a window before the midpoint, clamped at index 0.
        first = max(0, size // 2 - n // 2)
        return bucket[first:first + n]
    return bucket
|
||||
|
||||
# Sample composition: 3 high, 3 mid and 2 low conversations plus 2 documents.
selected = [
    *pick(high, 3),
    *pick(mid, 3),
    *pick(low, 2),
    *pick(docs, 2),
]
|
||||
|
||||
# Tag each with its bucket
|
||||
def bucket_for(ep):
    """Return the bucket label for an episode: "document", "high", "mid" or "low".

    Documents are labelled first regardless of entity count; conversations are
    then split on the module-level ``top_q`` / ``bottom_q`` boundaries.
    """
    if is_document(ep["name"]):
        return "document"
    entity_count = ep["entities"]
    if entity_count >= top_q:
        label = "high"
    elif entity_count > bottom_q:
        label = "mid"
    else:
        label = "low"
    return label
|
||||
|
||||
# Tag every selected episode in place, listing each one as it is tagged.
print(f"\nSelected {len(selected)} episodes for E1:")
for ep in selected:
    ep["bucket"] = bucket_for(ep)
    print(f" [{ep['bucket']:>8}] {ep['entities']:>3}e {ep['name']}")

# Save selection together with the boundaries that produced it.
selection_metadata = {
    "purpose": "E1 cascade re-extraction sample (n=10)",
    "stratification": "density buckets + document subset",
    "quartile_top": top_q,
    "quartile_bottom": bottom_q,
    "total_tier1_episodes": len(episodes),
}
with open(OUTPUT, "w") as f:
    json.dump({"metadata": selection_metadata, "selected": selected}, f, indent=2)

print(f"\nSaved to {OUTPUT}")
|
||||
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2 follow-up: confirm Aaron AI alias situation, find other potential duplicates."""
|
||||
import subprocess
|
||||
|
||||
# (label, Cypher) pairs run in order against the "aaron" graph.
QUERIES = [
    (
        "Aaron AI variants",
        "MATCH (n:Entity) WHERE n.name CONTAINS 'Aaron AI' OR n.name CONTAINS 'ARIN' OR n.name CONTAINS 'RNAI' RETURN n.name, n.summary",
    ),
    (
        "All Mossygear-named entities",
        "MATCH (n:Entity) WHERE n.name CONTAINS 'Mossy' OR n.name CONTAINS 'A+K' OR n.name CONTAINS 'AK Design' RETURN n.name, n.summary",
    ),
    (
        "Total entity count check",
        "MATCH (n:Entity) RETURN count(n) as total",
    ),
    (
        "Top 30 entity names by edge count",
        "MATCH (n:Entity)-[r]-() RETURN n.name, count(r) as edges ORDER BY edges DESC LIMIT 30",
    ),
]

for title, cypher in QUERIES:
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"QUERY: {title}")
    print(rule)
    # The raw redis-cli output is printed as-is; it is not parsed.
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2: Entity resolution diagnostic. Queries Graphiti's FalkorDB for the six test entities."""
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
TEST_ENTITIES = ["Aaron", "Kat", "HVAMC", "Bird", "Susan Hamlet", "Tulsa album"]
|
||||
|
||||
def run_cypher(query):
    """Execute a Cypher query on the "aaron" graph and return redis-cli's stdout.

    The exit status and stderr of the docker call are not inspected.
    """
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", query]
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.stdout
|
||||
|
||||
# Print every Entity node whose name contains each test string.
for name in TEST_ENTITIES:
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"ENTITY: {name}")
    print(rule)
    query = "MATCH (n:Entity) WHERE n.name CONTAINS '{}' RETURN n.name, n.summary".format(name)
    print(run_cypher(query))
|
||||
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
"""E2 follow-up: how many distinct episodes connect to each entity?"""
|
||||
import subprocess
|
||||
|
||||
# One (label, Cypher) pair per entity; every query has the same shape and lists
# up to 30 distinct episode names linked to the exactly-named entity.
QUERIES = [
    (entity, f"MATCH (n:Entity {{name: '{entity}'}})-[]-(e:Episodic) RETURN DISTINCT e.name LIMIT 30")
    for entity in (
        "Aaron",
        "Nelson",
        "HVAMC",
        "Bird",
        "Tulsa album",
        "Susan Hamlet",
        "Kat",
        "Katherine Wilson",
    )
]

for title, cypher in QUERIES:
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"ENTITY: {title}")
    print(rule)
    command = ["docker", "exec", "falkordb", "redis-cli", "GRAPH.QUERY", "aaron", cypher]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
|
||||
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Experiment 005 — Actual API Token Measurement
|
||||
|
||||
Measures input token reduction from prepending v2 briefing vs raw document
|
||||
on Claude Haiku, validating the 42.0% modeled estimate from Experiment 002b.
|
||||
|
||||
Outputs: ~/aaronai/experiments/token_measurement_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import anthropic
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
INPUT_FILE = Path.home() / "aaronai" / "briefing_test_v2_results.json"
|
||||
OUTPUT_FILE = Path.home() / "aaronai" / "experiments" / "token_measurement_results.json"
|
||||
MODEL = "claude-haiku-4-5-20251001"
|
||||
MAX_TOKENS = 1024
|
||||
|
||||
EXTRACTION_PROMPT = (
|
||||
"Extract entities and their relationships from the document below. "
|
||||
"Return ONLY valid JSON with this schema:\n"
|
||||
"{\n"
|
||||
' "people": [string],\n'
|
||||
' "organizations": [string],\n'
|
||||
' "locations": [string],\n'
|
||||
' "dates": [string],\n'
|
||||
' "relationships": [{"subject": string, "predicate": string, "object": string}]\n'
|
||||
"}\n"
|
||||
"No prose, no markdown fences, no commentary. JSON only."
|
||||
)
|
||||
|
||||
|
||||
def fetch_document_text(pg_conn, source):
    """Reconstruct the document by concatenating its chunks from pgvector.

    Args:
        pg_conn: Open DB-API connection; it is left open for the caller.
        source: Value matched against the ``source`` column of ``embeddings``.

    Returns:
        The ``document`` values joined by blank lines in ``id`` order, or None
        when no rows match.
    """
    cur = pg_conn.cursor()
    try:
        cur.execute(
            "SELECT document FROM embeddings WHERE source = %s ORDER BY id",
            (source,),
        )
        rows = cur.fetchall()
    finally:
        # Close the cursor even when the query raises; it used to stay open
        # on error for as long as the connection lived.
        cur.close()
    if not rows:
        return None
    return "\n\n".join(r[0] for r in rows)
|
||||
|
||||
|
||||
def build_raw_message(document_text):
    """Return the extraction prompt followed by the document, with no briefing."""
    return "{}\n\nDOCUMENT:\n{}".format(EXTRACTION_PROMPT, document_text)
|
||||
|
||||
|
||||
def build_briefed_message(briefing, document_text):
    """Return the extraction prompt, the JSON-rendered briefing, then the document.

    The briefing is serialised with ``json.dumps(..., indent=2)`` and the three
    sections are separated by blank lines.
    """
    briefing_str = json.dumps(briefing, indent=2)
    sections = [
        f"{EXTRACTION_PROMPT}",
        f"BRIEFING (pre-analysis from local model — use to orient):\n{briefing_str}",
        f"DOCUMENT:\n{document_text}",
    ]
    return "\n\n".join(sections)
|
||||
|
||||
|
||||
def call_haiku(client, message_text):
    """Send one user message to MODEL and return usage, latency and reply text.

    Returns:
        Dict with ``input_tokens``, ``output_tokens``, ``latency_s`` (seconds,
        rounded to 2 places), ``response_text`` (text of the first content
        block, or "" when the response has no content) and ``stop_reason``.
    """
    started = time.time()
    reply = client.messages.create(
        model=MODEL,
        max_tokens=MAX_TOKENS,
        messages=[{"role": "user", "content": message_text}],
    )
    outcome = {}
    outcome["input_tokens"] = reply.usage.input_tokens
    outcome["output_tokens"] = reply.usage.output_tokens
    outcome["latency_s"] = round(time.time() - started, 2)
    if reply.content:
        outcome["response_text"] = reply.content[0].text
    else:
        outcome["response_text"] = ""
    outcome["stop_reason"] = reply.stop_reason
    return outcome
|
||||
|
||||
|
||||
def ci_95(values):
    """Return ``(mean, half_width)`` for a 95% interval using z = 1.96.

    With fewer than two values the half-width is 0.0, and the mean of an
    empty input is reported as 0.0.
    """
    count = len(values)
    if count >= 2:
        center = statistics.mean(values)
        spread = statistics.stdev(values)
        return (center, 1.96 * spread / (count ** 0.5))
    if values:
        return (statistics.mean(values), 0.0)
    return (0.0, 0.0)
|
||||
|
||||
|
||||
def main():
    """Run experiment 005: measure actual API token usage, raw vs. briefed.

    For every document in ``INPUT_FILE`` that has status ``SUCCESS`` and a
    briefing, the document text is rebuilt from the database and sent to the
    model twice — once with only the extraction prompt, once with the
    briefing added — and the reported token counts are compared. Per-document
    results and an aggregate summary are written to ``OUTPUT_FILE`` as JSON
    and a short report is printed.

    Exits with status 1 when ``INPUT_FILE`` is missing or when either
    ``ANTHROPIC_API_KEY`` or ``PG_DSN`` is not set in the environment.
    """
    if not INPUT_FILE.exists():
        print(f"ERROR: {INPUT_FILE} not found", file=sys.stderr)
        sys.exit(1)

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    pg_dsn = os.environ.get("PG_DSN")
    if not pg_dsn:
        print("ERROR: PG_DSN not set", file=sys.stderr)
        sys.exit(1)

    # Parse the input before opening any connection, so a malformed file
    # cannot leave a database connection behind. Explicit UTF-8 keeps the
    # read independent of the platform's default encoding.
    with open(INPUT_FILE, encoding="utf-8") as f:
        v2_data = json.load(f)

    docs_meta = [
        d for d in v2_data["documents"]
        if d.get("status") == "SUCCESS"
        and d.get("briefing")
    ]

    print(f"Loaded {len(docs_meta)} successful briefings from {INPUT_FILE.name}")
    print(f"Model: {MODEL}")
    print(f"Calls planned: up to {len(docs_meta) * 2}\n")

    client = anthropic.Anthropic(api_key=api_key)
    pg_conn = psycopg2.connect(pg_dsn)

    results = []
    started_at = datetime.now(timezone.utc).isoformat()
    t_total = time.time()

    # try/finally (not ``with pg_conn``) because the connection must be
    # closed explicitly, including when a lookup or a document entry fails.
    try:
        for i, doc in enumerate(docs_meta, 1):
            source = doc["source"]
            briefing = doc["briefing"]

            document_text = fetch_document_text(pg_conn, source)
            if not document_text:
                print(f"[{i:02d}/{len(docs_meta)}] {source[:60]} -- SKIP (not in pgvector)")
                results.append({"source": source, "skipped": "not_in_pgvector"})
                continue

            print(f"[{i:02d}/{len(docs_meta)}] {source[:60]}")

            # A failed call is recorded as {"error": ...} instead of aborting
            # the run; the pair is then excluded from the statistics below.
            try:
                raw_result = call_haiku(client, build_raw_message(document_text))
            except Exception as e:
                print(f" RAW FAILED: {e}")
                raw_result = {"error": str(e)}

            try:
                briefed_result = call_haiku(client, build_briefed_message(briefing, document_text))
            except Exception as e:
                print(f" BRIEFED FAILED: {e}")
                briefed_result = {"error": str(e)}

            delta = None
            if "input_tokens" in raw_result and "input_tokens" in briefed_result:
                raw_in = raw_result["input_tokens"]
                briefed_in = briefed_result["input_tokens"]
                raw_out = raw_result["output_tokens"]
                briefed_out = briefed_result["output_tokens"]
                # Guard the percentages against a zero-token baseline.
                input_red = (raw_in - briefed_in) / raw_in * 100 if raw_in else 0.0
                output_delta = (briefed_out - raw_out) / raw_out * 100 if raw_out else 0.0
                delta = {
                    "input_reduction_pct": round(input_red, 2),
                    "output_delta_pct": round(output_delta, 2),
                    "raw_input_tokens": raw_in,
                    "briefed_input_tokens": briefed_in,
                    "raw_output_tokens": raw_out,
                    "briefed_output_tokens": briefed_out,
                }
                print(
                    f" in: {raw_in} -> {briefed_in} ({input_red:+.1f}%) | "
                    f"out: {raw_out} -> {briefed_out}"
                )

            results.append({
                "source": source,
                "raw": raw_result,
                "briefed": briefed_result,
                "delta": delta,
            })
    finally:
        pg_conn.close()

    total_elapsed = round(time.time() - t_total, 1)

    # Only documents where both calls succeeded contribute to the statistics.
    valid = [r for r in results if r.get("delta") is not None]
    skipped = [r for r in results if r.get("skipped")]
    reductions = [r["delta"]["input_reduction_pct"] for r in valid]
    output_deltas = [r["delta"]["output_delta_pct"] for r in valid]
    raw_in_total = sum(r["delta"]["raw_input_tokens"] for r in valid)
    briefed_in_total = sum(r["delta"]["briefed_input_tokens"] for r in valid)
    raw_out_total = sum(r["delta"]["raw_output_tokens"] for r in valid)
    briefed_out_total = sum(r["delta"]["briefed_output_tokens"] for r in valid)

    # Rates used for the cost estimate: USD per one million tokens.
    HAIKU_IN = 1.0
    HAIKU_OUT = 5.0
    raw_cost = (raw_in_total * HAIKU_IN + raw_out_total * HAIKU_OUT) / 1_000_000
    briefed_cost = (briefed_in_total * HAIKU_IN + briefed_out_total * HAIKU_OUT) / 1_000_000

    mean_red, ci_half = ci_95(reductions)
    mean_out_delta, _ = ci_95(output_deltas)

    # Reduction figure the measured mean is compared against.
    v2_modeled_pct = 42.0

    summary = {
        "experiment": "005",
        "title": "Actual API Token Measurement",
        "started_at": started_at,
        "completed_at": datetime.now(timezone.utc).isoformat(),
        "model": MODEL,
        "extraction_prompt": EXTRACTION_PROMPT,
        "n_documents_attempted": len(docs_meta),
        "n_skipped_not_in_pgvector": len(skipped),
        "n_valid_pairs": len(valid),
        "n_failed": len(docs_meta) - len(valid) - len(skipped),
        "total_elapsed_s": total_elapsed,
        "input_token_reduction": {
            "mean_pct": round(mean_red, 2),
            "ci_95_half_width_pct": round(ci_half, 2),
            "median_pct": round(statistics.median(reductions), 2) if reductions else None,
            "min_pct": round(min(reductions), 2) if reductions else None,
            "max_pct": round(max(reductions), 2) if reductions else None,
            "stdev_pct": round(statistics.stdev(reductions), 2) if len(reductions) > 1 else 0.0,
        },
        "output_token_delta": {"mean_pct": round(mean_out_delta, 2)},
        "totals": {
            "raw_input_tokens": raw_in_total,
            "briefed_input_tokens": briefed_in_total,
            "raw_output_tokens": raw_out_total,
            "briefed_output_tokens": briefed_out_total,
            "raw_cost_usd": round(raw_cost, 4),
            "briefed_cost_usd": round(briefed_cost, 4),
            "savings_usd": round(raw_cost - briefed_cost, 4),
        },
        "comparison_to_v2_estimate": {
            "v2_modeled_reduction_pct": v2_modeled_pct,
            "measured_mean_reduction_pct": round(mean_red, 2),
            "delta_pct_points": round(mean_red - v2_modeled_pct, 2),
        },
        "results": results,
    }

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print()
    print("=" * 60)
    print(f"DONE — {len(valid)}/{len(docs_meta)} valid pairs in {total_elapsed}s")
    if skipped:
        print(f"Skipped (not in pgvector): {len(skipped)}")
    print(f"Mean input token reduction: {mean_red:.2f}% +/- {ci_half:.2f}% (95% CI)")
    print(f"V2 modeled estimate: {v2_modeled_pct}% | delta: {mean_red - v2_modeled_pct:+.2f} pts")
    print(f"Mean output token delta: {mean_out_delta:+.2f}%")
    print(f"Total cost: ${raw_cost + briefed_cost:.4f}")
    print(f"Results: {OUTPUT_FILE}")
|
||||
|
||||
|
||||
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user