#!/usr/bin/env python3
"""
BirdAI Briefing Generator Test
===============================
Tests the local LLM as a document briefing generator.
The local model produces a structured roadmap for the API —
cleaning, structure detection, signal flagging — without semantic judgment.
Results written to ~/aaronai/briefing_test_results.json
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
# --- Environment & tunables ------------------------------------------------
load_dotenv(os.path.expanduser("~/aaronai/.env"))
PG_DSN = os.getenv("PG_DSN")  # Postgres/pgvector DSN; required (checked in get_sample_documents)
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json")
MODEL = "mistral"  # local Ollama model name
SAMPLE_SIZE = 50  # distinct-source documents to sample per run
OLLAMA_URL = "http://localhost:11434/api/generate"
# Closed vocabularies used by sanitize_briefing() to validate model output;
# anything outside these sets falls back to a safe default.
VALID_DOC_TYPES = {
    "academic_pdf", "technical_doc", "chat_log", "code",
    "presentation", "book_excerpt", "form", "syllabus",
    "email", "notes", "unknown"
}
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}
# Prompt sent to the local model; the document text is appended after it.
# The enum values listed in the prompt must stay in sync with the sets above.
BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.
Return exactly this structure:
{
"document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown",
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}
Rules:
- document_type: identify from formatting patterns and vocabulary, not meaning
- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse
- has_proper_nouns: true if you see capitalized words that are not sentence starts
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"]
- noise_signals: list any noise patterns you see e.g. ["repeated_headers", "page_numbers", "formatting_artifacts", "boilerplate"]
- extraction_priority: full if density=high and likely_has_named_entities=true, skip if density=low and likely_has_named_entities=false, partial otherwise
Document:
"""
def get_sample_documents():
    """Sample up to SAMPLE_SIZE documents from the embeddings table.

    Picks one random row per distinct ``source`` (DISTINCT ON + random()
    ordering), restricted to documents between 100 and 3000 characters.

    Returns:
        list of RealDictRow with keys id, document, source, created_at.

    Raises:
        RuntimeError: if PG_DSN was not loaded from the .env file.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    try:
        # Cursor context manager closes the cursor even if the query fails;
        # the finally guarantees the connection is released on any error.
        # (Original leaked both on exception between connect() and close().)
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT DISTINCT ON (source) id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                  AND length(document) < 3000
                ORDER BY source, random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        conn.close()
def run_briefing(text):
    """Ask the local Ollama model for a JSON briefing of *text*.

    Only the first 1500 characters of the document are sent. The model's
    response is scanned for the outermost {...} span and parsed as JSON.

    Returns:
        (briefing_dict, raw_response) on success, or (None, error_string)
        describing the failure (NO_JSON / NOT_DICT / URL_ERROR / TIMEOUT /
        JSON_ERROR / ERROR).
    """
    request_body = json.dumps({
        "model": MODEL,
        "prompt": BRIEFING_PROMPT + text[:1500],
        "stream": False,
    }).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as resp:
            raw = json.loads(resp.read().decode()).get("response", "").strip()
        # Tolerate prose around the JSON: take the outermost brace span.
        brace_open = raw.find("{")
        brace_close = raw.rfind("}") + 1
        if brace_open == -1 or brace_close == 0:
            return None, f"NO_JSON: {raw[:200]}"
        parsed = json.loads(raw[brace_open:brace_close])
        if isinstance(parsed, dict):
            return parsed, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
def sanitize_briefing(briefing):
    """Coerce a raw model briefing into the fixed, trusted schema.

    Enum fields outside their valid set fall back to defaults ("unknown",
    "medium", "partial"); boolean fields accept bools, "true"/"yes"/"1"
    strings, or anything truthy; signal fields become lists of non-empty
    strings. primary_language is lowercased and capped at 10 chars.
    """
    def _pick(key, valid, default):
        # Normalize then validate against the closed vocabulary.
        candidate = str(briefing.get(key, default)).lower().strip()
        return candidate if candidate in valid else default

    def _truthy(key):
        raw_value = briefing.get(key, False)
        if isinstance(raw_value, bool):
            return raw_value
        if isinstance(raw_value, str):
            return raw_value.lower() in ("true", "yes", "1")
        return bool(raw_value)

    def _as_list(key):
        raw_value = briefing.get(key, [])
        if isinstance(raw_value, list):
            return [str(item) for item in raw_value if item]
        if isinstance(raw_value, str):
            return [raw_value] if raw_value else []
        return []

    safe = {}
    safe["document_type"] = _pick("document_type", VALID_DOC_TYPES, "unknown")
    safe["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]
    safe["density"] = _pick("density", VALID_DENSITIES, "medium")
    for flag in ("has_proper_nouns", "has_dates", "has_numeric_data",
                 "has_institutional_language", "has_technical_terms",
                 "likely_has_named_entities"):
        safe[flag] = _truthy(flag)
    safe["structure_signals"] = _as_list("structure_signals")
    safe["noise_signals"] = _as_list("noise_signals")
    safe["extraction_priority"] = _pick("extraction_priority", VALID_PRIORITIES, "partial")
    return safe
def estimate_token_reduction(original_text, briefing):
    """Estimate API tokens saved by using this briefing instead of a raw pass.

    Token counts use a rough chars/4 heuristic (minimum 1). The briefing is
    credited with replacing a ~200-token "orientation" pass, and each noise
    signal the model flagged is credited with trimming 5% of the document
    (capped at 40%). A "skip" priority means no API call at all.

    Args:
        original_text: the document text the API would otherwise receive.
        briefing: sanitized briefing dict (extraction_priority, noise_signals).

    Returns:
        dict of rounded estimates; the "skip" branch carries a "note" key
        instead of noise_tokens_saved.
    """
    # Single source of truth for the orientation-pass cost; the original
    # repeated the bare literal 200 in two other places.
    ORIENTATION_TOKENS = 200
    original_tokens = max(len(original_text) / 4, 1)  # ~4 chars per token
    if briefing.get("extraction_priority") == "skip":
        # Skipped entirely: the whole document plus orientation is saved.
        return {
            "original_tokens_approx": round(original_tokens),
            "orientation_tokens_saved": round(original_tokens + ORIENTATION_TOKENS),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call"
        }
    noise_count = len(briefing.get("noise_signals", []))
    noise_reduction_pct = min(noise_count * 0.05, 0.40)  # 5% per signal, 40% cap
    noise_tokens_saved = original_tokens * noise_reduction_pct
    total_saved = ORIENTATION_TOKENS + noise_tokens_saved
    total_cost = original_tokens + ORIENTATION_TOKENS
    # Cap at 99% — a non-skip document always costs something.
    reduction_pct = min((total_saved / total_cost) * 100, 99.0)
    return {
        "original_tokens_approx": round(original_tokens),
        "orientation_tokens_saved": ORIENTATION_TOKENS,
        "noise_tokens_saved": round(noise_tokens_saved),
        "noise_reduction_pct": round(noise_reduction_pct * 100, 1),
        "total_reduction_pct": round(reduction_pct, 1)
    }
def format_eta(elapsed_times, completed, total):
    """Project time remaining as "ETA: H:MM:SS" from per-doc timings.

    Before the first document finishes (completed == 0) the estimate is
    unknowable, so a placeholder is returned.
    """
    if not completed:
        return "ETA: --:--"
    mean_seconds = sum(elapsed_times) / completed
    seconds_left = int((total - completed) * mean_seconds)
    return f"ETA: {timedelta(seconds=seconds_left)}"
def content_hash(text):
    """Short fingerprint for dedup/display: first 8 hex chars of the MD5."""
    digest = hashlib.md5(text.encode())
    return digest.hexdigest()[:8]
def main():
    """Run the briefing benchmark end-to-end.

    Samples distinct-source documents from pgvector, asks the local model
    for a briefing on each, sanitizes the result, estimates token savings,
    and writes JSON results (checkpointed per document, finalized with a
    summary) to RESULTS_FILE. Prints a per-document progress line and a
    summary table.
    """
    test_start = time.time()
    print(f"\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")
    # Result skeleton; meta timing fields are filled in after the loop.
    results = {
        "meta": {
            "model": MODEL,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }
    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []  # per-document wall-clock seconds; feeds the ETA display
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        # ETA is based on documents finished so far (len(elapsed_times) == i here).
        eta_str = format_eta(elapsed_times, i, len(docs))
        print(f"[{i+1:02d}/{len(docs)}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)
        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)
        if briefing is None:
            # Model call failed; `raw` carries the error string from run_briefing.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"{priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })
        # Checkpoint after every document so a crash loses at most one result.
        # NOTE(review): indentation was reconstructed — this write sits inside
        # the loop here (a final write also happens below); confirm placement
        # against the original file.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc
    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # NOTE(review): success_rate and approach_viable divide by `total` with no
    # guard — an empty sample raises ZeroDivisionError here.
    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_count / total * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_count / total >= 0.8  # viability bar: >= 80% parse success
    }
    results["summary"] = summary
    # Final write includes the completed meta fields and the summary.
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 75)
    print(f"RESULTS")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)
if __name__ == "__main__":
    main()