experiments: add consistency test and briefing generator results + scripts

This commit is contained in:
2026-04-28 02:47:41 +00:00
parent 9937abbe27
commit b6fe350ab2
6 changed files with 6985 additions and 0 deletions
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+313
View File
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
BirdAI Briefing Generator Test
===============================
Tests the local LLM as a document briefing generator.
The local model produces a structured roadmap for the API —
cleaning, structure detection, signal flagging — without semantic judgment.
Results written to ~/aaronai/briefing_test_results.json
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
# Load DB credentials and settings from the project .env file.
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# PostgreSQL DSN for the pgvector-backed embeddings table (required).
PG_DSN = os.getenv("PG_DSN")
# JSON file receiving per-document results plus the final summary.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json")
# Ollama model used to generate briefings.
MODEL = "mistral"
# Number of distinct-source documents sampled from the database.
SAMPLE_SIZE = 50
# Local Ollama generate endpoint.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Allowed enum values for briefing fields; anything else is coerced to a
# safe default by sanitize_briefing().
VALID_DOC_TYPES = {
    "academic_pdf", "technical_doc", "chat_log", "code",
    "presentation", "book_excerpt", "form", "syllabus",
    "email", "notes", "unknown"
}
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}
# Instruction prompt sent to the local model; the (truncated) document
# text is appended after the trailing "Document:" line by run_briefing().
BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.
Return exactly this structure:
{
"document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown",
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}
Rules:
- document_type: identify from formatting patterns and vocabulary, not meaning
- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse
- has_proper_nouns: true if you see capitalized words that are not sentence starts
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"]
- noise_signals: list any noise patterns you see e.g. ["repeated_headers", "page_numbers", "formatting_artifacts", "boilerplate"]
- extraction_priority: full if density=high and likely_has_named_entities=true, skip if density=low and likely_has_named_entities=false, partial otherwise
Document:
"""
def get_sample_documents():
    """Fetch up to SAMPLE_SIZE random rows, one per distinct source.

    Returns a list of RealDictCursor rows with id, document, source and
    created_at. Raises RuntimeError when PG_DSN is missing so the failure
    is explicit instead of an opaque connection error.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    connection = psycopg2.connect(PG_DSN)
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    # Length bounds exclude tiny stubs and oversized chunks; DISTINCT ON
    # plus random() picks one random-ish row per source.
    cursor.execute(
        """
        SELECT DISTINCT ON (source) id, document, source, created_at
        FROM embeddings
        WHERE length(document) > 100
        AND length(document) < 3000
        ORDER BY source, random()
        LIMIT %s
        """,
        (SAMPLE_SIZE,),
    )
    rows = cursor.fetchall()
    cursor.close()
    connection.close()
    return rows
def run_briefing(text):
    """Ask the local Ollama model for a JSON briefing of *text*.

    Only the first 1500 characters of the document are sent. Returns
    (parsed_dict, raw_response) on success, or (None, error_string) on
    any failure (network error, timeout, unparseable/non-dict output).
    """
    request_body = json.dumps({
        "model": MODEL,
        "prompt": BRIEFING_PROMPT + text[:1500],
        "stream": False
    }).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as response:
            reply = json.loads(response.read().decode())
        raw = reply.get("response", "").strip()
        # Tolerate prose around the JSON: keep only the outermost {...} span.
        first_brace = raw.find("{")
        after_last_brace = raw.rfind("}") + 1
        if first_brace == -1 or after_last_brace == 0:
            return None, f"NO_JSON: {raw[:200]}"
        parsed = json.loads(raw[first_brace:after_last_brace])
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {raw[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
def sanitize_briefing(briefing):
    """Coerce a model-produced briefing dict into the expected schema.

    Unknown enum values fall back to safe defaults ("unknown", "medium",
    "partial"), booleans are coerced from strings or truthiness, and the
    two signal fields are forced to lists of non-empty strings.
    """
    clean = {}

    doc_type = str(briefing.get("document_type", "unknown")).lower().strip()
    if doc_type not in VALID_DOC_TYPES:
        doc_type = "unknown"
    clean["document_type"] = doc_type

    # Language codes are short; cap at 10 chars in case the model rambles.
    clean["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]

    density = str(briefing.get("density", "medium")).lower().strip()
    if density not in VALID_DENSITIES:
        density = "medium"
    clean["density"] = density

    boolean_fields = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for name in boolean_fields:
        value = briefing.get(name, False)
        if isinstance(value, bool):
            clean[name] = value
        elif isinstance(value, str):
            # Accept common textual spellings of "true".
            clean[name] = value.lower() in ("true", "yes", "1")
        else:
            clean[name] = bool(value)

    for name in ("structure_signals", "noise_signals"):
        value = briefing.get(name, [])
        if isinstance(value, list):
            clean[name] = [str(item) for item in value if item]
        elif isinstance(value, str):
            clean[name] = [value] if value else []
        else:
            clean[name] = []

    priority = str(briefing.get("extraction_priority", "partial")).lower().strip()
    if priority not in VALID_PRIORITIES:
        priority = "partial"
    clean["extraction_priority"] = priority
    return clean
def estimate_token_reduction(original_text, briefing):
    """Rough estimate of API tokens saved by acting on this briefing.

    Uses the ~4 characters/token heuristic. A "skip" priority saves the
    whole document plus the fixed orientation overhead (no API call at
    all); otherwise savings are the orientation overhead plus 5% of the
    document per detected noise signal, capped at 40% of the document.

    Returns a dict of rounded token/percentage figures for the results
    file. Fix over original: the orientation overhead (200 tokens) was
    defined as a constant but then repeated as a magic literal in two
    expressions — now the constant is used everywhere (behavior unchanged).
    """
    # ~4 chars per token; floor of 1 keeps the division below well-defined.
    original_tokens = max(len(original_text) / 4, 1)
    # Fixed prompt/orientation overhead the briefing lets the API skip.
    orientation_saved = 200
    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": round(original_tokens),
            "orientation_tokens_saved": round(original_tokens + orientation_saved),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call"
        }
    noise_count = len(briefing.get("noise_signals", []))
    # 5% of the document per noise signal, capped at 40%.
    noise_reduction_pct = min(noise_count * 0.05, 0.40)
    noise_tokens_saved = original_tokens * noise_reduction_pct
    total_saved = orientation_saved + noise_tokens_saved
    # What the API would otherwise pay: document + orientation overhead.
    total_cost = original_tokens + orientation_saved
    reduction_pct = min((total_saved / total_cost) * 100, 99.0)
    return {
        "original_tokens_approx": round(original_tokens),
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_tokens_saved),
        "noise_reduction_pct": round(noise_reduction_pct * 100, 1),
        "total_reduction_pct": round(reduction_pct, 1)
    }
def format_eta(elapsed_times, completed, total):
    """Format a wall-clock ETA string from per-item elapsed times.

    Returns "ETA: --:--" before any item has finished; otherwise
    extrapolates the mean per-item time over the remaining items.
    """
    if completed == 0:
        return "ETA: --:--"
    mean_elapsed = sum(elapsed_times) / completed
    seconds_left = int((total - completed) * mean_elapsed)
    return f"ETA: {timedelta(seconds=seconds_left)}"
def content_hash(text):
    """Short (8 hex chars) MD5 fingerprint of *text* for labelling results."""
    digest = hashlib.md5(text.encode()).hexdigest()
    return digest[:8]
def main():
    """Run the briefing test end-to-end and write results to RESULTS_FILE.

    Samples documents, asks the local model for a briefing per document,
    sanitizes each briefing, estimates token savings, and prints a running
    log plus a final summary. The results file is rewritten after every
    document (checkpointing) and once more at the end with meta/summary
    filled in.
    """
    test_start = time.time()
    print(f"\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")
    # Skeleton of the results file; meta/summary are completed after the loop.
    results = {
        "meta": {
            "model": MODEL,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }
    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        eta_str = format_eta(elapsed_times, i, len(docs))
        print(f"[{i+1:02d}/{len(docs)}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)
        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)
        if briefing is None:
            # Model produced no parseable JSON; record the error string verbatim.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"{priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })
        # Checkpoint after every document so a crash loses at most one result.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc
    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # NOTE(review): success_rate divides by total with no guard — raises
    # ZeroDivisionError if the sample query returns no documents; confirm
    # that failing loudly on an empty sample is the intended behavior.
    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_count / total * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_count / total >= 0.8
    }
    results["summary"] = summary
    # Final write includes the completed meta block and summary.
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 75)
    print(f"RESULTS")
    print(f"  Success rate:        {success_count}/{total} ({summary['success_rate']}%)")
    print(f"  Failed:              {failed_count}")
    print(f"  Priority — full:     {priority_counts.get('full', 0)}")
    print(f"  Priority — partial:  {priority_counts.get('partial', 0)}")
    print(f"  Priority — skip:     {priority_counts.get('skip', 0)}")
    print(f"  Avg token reduction: {avg_reduction}%")
    print(f"  Total elapsed:       {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f"  Avg per document:    {avg_per_doc}s")
    print(f"  Projected 50 docs:   {summary['projected_50_doc_minutes']} min")
    print(f"  Approach viable:     {'YES' if summary['approach_viable'] else 'NO'}")
    print(f"  Completed:           {completed_at}")
    print(f"  Full results:        {RESULTS_FILE}")
    print("=" * 75)
if __name__ == "__main__":
    main()
+248
View File
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
BirdAI Cascaded Extraction — Consistency Test
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime
from dotenv import load_dotenv
# Load DB credentials and settings from the project .env file.
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# PostgreSQL DSN for the pgvector-backed embeddings table (required).
PG_DSN = os.getenv("PG_DSN")
# JSON file receiving per-document results plus the final summary.
RESULTS_FILE = os.path.expanduser("~/aaronai/consistency_test_results.json")
# Ollama model under test.
MODEL = "mistral"
# How many times each document is extracted to measure determinism.
PASSES = 3
# Number of documents sampled from the database.
SAMPLE_SIZE = 50
# Local Ollama generate endpoint.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Instruction prompt sent to the local model; the (truncated) document
# text is appended after the trailing "Text: " by run_extraction().
EXTRACTION_PROMPT = """Extract named entities from this text. Return JSON only, no explanation, no prose.
Use exactly these fields (omit any field you are uncertain about, use empty list if none found):
{
"people": [],
"organizations": [],
"locations": [],
"dates": [],
"document_type": ""
}
Rules:
- Every value in people, organizations, locations, dates must be a plain string
- document_type must be a plain string
- No nested objects, no nested lists
- Only include entities you are certain about
- If uncertain about anything, omit it
Text: """
def get_sample_documents():
    """Fetch SAMPLE_SIZE random mid-sized documents from the embeddings table.

    Returns a list of RealDictCursor rows with id, document, source and
    created_at.

    Raises:
        RuntimeError: if PG_DSN is missing from the environment — the
            sibling briefing script guards this explicitly, and without
            the guard psycopg2.connect(None) fails with an opaque error.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    # Length bounds exclude tiny stubs (<100 chars) and oversized chunks (>3000).
    cur.execute("""
        SELECT id, document, source, created_at
        FROM embeddings
        WHERE length(document) > 100
        AND length(document) < 3000
        ORDER BY random()
        LIMIT %s
    """, (SAMPLE_SIZE,))
    docs = cur.fetchall()
    cur.close()
    conn.close()
    return docs
def run_extraction(text):
    """Ask the local Ollama model to extract named entities from *text*.

    Only the first 1500 characters are sent. Returns (parsed_dict,
    raw_response) on success, or (None, error_string) on any failure.

    Fixes over original (matching run_briefing in the sibling script):
    *raw* is initialized before the try block and the JSON_ERROR message
    includes a snippet of the raw model output for debugging.
    """
    prompt = EXTRACTION_PROMPT + text[:1500]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    raw = ""
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        # Tolerate prose around the JSON: keep only the outermost {...} span.
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:100]}"
        json_str = raw[start:end]
        parsed = json.loads(json_str)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {json_str[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
def flatten_value(v):
    """Normalize an extracted value to a canonical lowercase string.

    Dicts become sorted-key JSON, lists become JSON of their sorted
    flattened elements, and everything else is stringified — all
    lowercased so pass-to-pass comparisons ignore case and ordering.
    """
    if isinstance(v, dict):
        return json.dumps(v, sort_keys=True).lower()
    if isinstance(v, list):
        flattened = sorted(flatten_value(item) for item in v)
        return json.dumps(flattened)
    if isinstance(v, str):
        return v.lower().strip()
    return str(v).lower().strip()
def normalize_extraction(extracted):
    """Return a canonical form of an extraction dict for comparison.

    Missing fields default to [] ("" for document_type); list fields are
    flattened and sorted so ordering differences between passes do not
    count as inconsistency. Returns None for a failed (None) extraction.
    """
    if extracted is None:
        return None
    canonical = {}
    for field in ("people", "organizations", "locations", "dates", "document_type"):
        default = "" if field == "document_type" else []
        value = extracted.get(field, default)
        if isinstance(value, list):
            canonical[field] = sorted(flatten_value(item) for item in value)
        else:
            canonical[field] = flatten_value(value)
    return canonical
def extractions_consistent(extractions):
    """True when every pass succeeded and all normalized outputs match.

    Any None (failed pass) — before or after normalization — makes the
    result False; an empty input is vacuously consistent.
    """
    for extraction in extractions:
        if extraction is None:
            return False
    normalized = [normalize_extraction(e) for e in extractions]
    if any(item is None for item in normalized):
        return False
    # Lazily compare against the first element; vacuously True when empty.
    return all(candidate == normalized[0] for candidate in normalized[1:])
def content_hash(text):
    """8-character MD5 prefix used to label documents in logs and results."""
    hasher = hashlib.md5()
    hasher.update(text.encode())
    return hasher.hexdigest()[:8]
def main():
    """Run the consistency test: PASSES extractions per doc, compare outputs.

    A document counts as CONSISTENT only when every pass parses and all
    normalized extractions are identical. TIMEOUT takes precedence over
    FAILED in the per-document status. Results are checkpointed to
    RESULTS_FILE after every document and finalized with a summary.
    """
    print(f"\nBirdAI Consistency Test")
    print(f"Model: {MODEL} | Passes: {PASSES} | Sample: {SAMPLE_SIZE} docs")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 60)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} documents from pgvector\n")
    # Skeleton of the results file; meta/summary are completed after the loop.
    results = {
        "meta": {
            "model": MODEL,
            "passes": PASSES,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None
        },
        "documents": [],
        "summary": {}
    }
    consistent_count = 0
    failed_count = 0
    timeout_count = 0
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        print(f"[{i+1:02d}/{len(docs)}] {source[:50]:<50} hash:{chash}", end=" ", flush=True)
        passes = []
        pass_times = []
        raw_outputs = []
        # Run the identical extraction PASSES times to measure determinism.
        for p in range(PASSES):
            t_start = time.time()
            extracted, raw = run_extraction(content)
            t_end = time.time()
            passes.append(extracted)
            pass_times.append(round(t_end - t_start, 1))
            raw_outputs.append(raw[:200] if raw else "")
        consistent = extractions_consistent(passes)
        # Timeouts are detected via the error string run_extraction returns.
        any_timeout = any("TIMEOUT" in str(r) for r in raw_outputs)
        any_failed = any(p is None for p in passes)
        if any_timeout:
            timeout_count += 1
            status = "TIMEOUT"
        elif any_failed:
            failed_count += 1
            status = "FAILED"
        elif consistent:
            consistent_count += 1
            status = "CONSISTENT"
        else:
            status = "INCONSISTENT"
        print(f"{status} ({'/'.join(str(t) for t in pass_times)}s)")
        # Keep one normalized sample for inspection; tolerate any failure.
        try:
            sample_extraction = normalize_extraction(passes[0]) if passes[0] else None
        except Exception:
            sample_extraction = None
        results["documents"].append({
            "id": doc_id,
            "source": source,
            "content_hash": chash,
            "content_length": len(content),
            "status": status,
            "consistent": consistent,
            "pass_times_seconds": pass_times,
            "extraction_sample": sample_extraction,
            "raw_samples": raw_outputs
        })
        # Checkpoint after every document so a crash loses at most one result.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total = len(docs)
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    # NOTE(review): consistency_rate divides by total with no guard —
    # raises ZeroDivisionError if the DB returned no documents; confirm
    # that failing loudly on an empty sample is the intended behavior.
    summary = {
        "total": total,
        "consistent": consistent_count,
        "inconsistent": total - consistent_count - failed_count - timeout_count,
        "failed": failed_count,
        "timeout": timeout_count,
        "consistency_rate": round(consistent_count / total * 100, 1),
        "cascade_viable": consistent_count / total >= 0.5
    }
    results["summary"] = summary
    # Final write includes the completed meta block and summary.
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 60)
    print(f"RESULTS")
    print(f"  Consistent:     {consistent_count}/{total} ({summary['consistency_rate']}%)")
    print(f"  Inconsistent:   {summary['inconsistent']}")
    print(f"  Failed/Timeout: {failed_count + timeout_count}")
    print(f"  Cascade viable: {'YES' if summary['cascade_viable'] else 'NO — reconsider architecture'}")
    print(f"  Completed:      {completed_at}")
    print(f"  Full results:   {RESULTS_FILE}")
    print("=" * 60)
if __name__ == "__main__":
    main()