#!/usr/bin/env python3
"""
BirdAI Briefing Generator v2 — Experiment 002b
===============================================
Changes from v1 (based on Experiment 004 human evaluation):
- document_type now pre-classified by rule, not by model
- Capture template header stripped before model sees content
- noise_signals constrained to controlled vocabulary
- Model prompt simplified — focuses only on reliable signal fields
- Expanded document type vocabulary for BirdAI-specific types

Results written to ~/aaronai/briefing_test_v2_results.json
"""

import json
import os
import re
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv

load_dotenv(os.path.expanduser("~/aaronai/.env"))

PG_DSN = os.getenv("PG_DSN")
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_v2_results.json")
MODEL = "mistral"
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"

VALID_DOC_TYPES = {
    "voice_capture", "image_capture", "dream_nrem", "dream_rem",
    "dream_lucid", "dream_synthesis", "presentation", "code",
    "spreadsheet", "academic_pdf", "technical_doc", "chat_log",
    "book_excerpt", "form", "syllabus", "email", "notes",
    "purchase_order", "annual_report", "invoice", "memo", "report",
    "unknown",
}
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}
VALID_NOISE_SIGNALS = {
    "repeated_headers", "page_numbers", "formatting_artifacts",
    "boilerplate", "watermarks", "footers", "line_numbers",
    "encoding_artifacts", "ocr_errors",
}
VALID_STRUCTURE_SIGNALS = {
    "headings", "bullet_lists", "numbered_lists", "tables", "code_blocks",
    "citations", "footnotes", "images", "forms", "columns", "sections",
}

# Substrings that identify a capture/dream template header (matched against
# the lower-cased text that precedes the first "---").
_TEMPLATE_HEADER_MARKERS = ("**type:**", "**modality:**", "# capture", "# dream")
_PRESENTATION_EXTENSIONS = (".pptx", ".ppt")
_SPREADSHEET_EXTENSIONS = (".xlsx", ".xls", ".csv")
_CODE_EXTENSIONS = (".py", ".js", ".ts", ".cpp", ".c", ".h", ".java", ".rs")

# Flat number of tokens added to every estimate in estimate_token_reduction().
_ORIENTATION_TOKENS = 200


def pre_classify_document(source, content):
    """Classify a document by rule and strip a capture/dream template header.

    Args:
        source: Path or name of the document's origin; only its basename is
            inspected. ``None`` is treated as an empty name.
        content: Raw document text.

    Returns:
        ``(doc_type, cleaned_content)``. ``doc_type`` is ``None`` when no
        rule matched (the model then classifies the document).
        ``cleaned_content`` is ``content`` with everything up to and
        including the first ``---`` removed when the text before it looks
        like a template header; otherwise it is ``content`` unchanged.
    """
    filename = os.path.basename(source or "").lower()
    doc_type = None
    cleaned_content = content

    # Header stripping: only when a "---" exists and the text before it
    # carries one of the known template markers.
    header, separator, remainder = content.partition("---")
    if separator:
        header = header.lower()
        if any(marker in header for marker in _TEMPLATE_HEADER_MARKERS):
            body = remainder.strip()
            # Never hand the model an empty document.
            cleaned_content = body if body else content

    # Filename rules take precedence over content rules; first match wins.
    # NOTE(review): these are substring tests, so e.g. "nrem" also matches
    # "unremarkable" — kept as in the original rule set.
    if "nrem" in filename:
        doc_type = "dream_nrem"
    elif "lucid" in filename:
        doc_type = "dream_lucid"
    elif "-rem-" in filename or filename.endswith("-rem.md"):
        doc_type = "dream_rem"
    elif "synthesis" in filename and filename.endswith(".md"):
        doc_type = "dream_synthesis"
    elif "-voice" in filename or "voice-" in filename:
        doc_type = "voice_capture"
    elif "-image" in filename or "image-" in filename:
        doc_type = "image_capture"
    elif filename.endswith(_PRESENTATION_EXTENSIONS):
        doc_type = "presentation"
    elif filename.endswith(_SPREADSHEET_EXTENSIONS):
        doc_type = "spreadsheet"
    elif filename.endswith(_CODE_EXTENSIONS):
        doc_type = "code"
    elif filename.endswith("cmakelists.txt") or filename == "makefile":
        doc_type = "code"
    elif content.startswith("# Dream"):
        # "nrem" must be tested before "rem" because it contains it.
        head = content[:50].lower()
        if "nrem" in head:
            doc_type = "dream_nrem"
        elif "lucid" in head:
            doc_type = "dream_lucid"
        elif "rem" in head:
            doc_type = "dream_rem"
        else:
            doc_type = "dream_synthesis"
    elif content.startswith("# Capture"):
        doc_type = "voice_capture" if "voice" in content[:100].lower() else "image_capture"

    return doc_type, cleaned_content


def build_briefing_prompt(content, pre_classified_type=None):
    """Build the model prompt asking for a JSON briefing of ``content``.

    Args:
        content: Document text; only the first 1500 characters are embedded.
        pre_classified_type: Rule-assigned document type. When given, it is
            written into the requested structure as a fixed value; otherwise
            the model is asked to choose from the model-classifiable types.

    Returns:
        The prompt string.
    """
    if pre_classified_type:
        type_instruction = f'\n "document_type": "{pre_classified_type}", // pre-classified, do not change'
    else:
        type_instruction = '\n "document_type": "one of: academic_pdf, technical_doc, chat_log, book_excerpt, form, syllabus, email, notes, purchase_order, annual_report, invoice, memo, report, unknown",'

    return f"""Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.

Return exactly this structure:
{{{type_instruction}
 "primary_language": "language code e.g. en, fr, de",
 "density": "one of: high, medium, low",
 "has_proper_nouns": true or false,
 "has_dates": true or false,
 "has_numeric_data": true or false,
 "has_institutional_language": true or false,
 "has_technical_terms": true or false,
 "likely_has_named_entities": true or false,
 "structure_signals": [],
 "noise_signals": [],
 "extraction_priority": "one of: full, partial, skip"
}}

Rules:
- density: high=information dense technical or academic, medium=mixed, low=narrative/literary/sparse/short
- has_proper_nouns: true if you see capitalized words that are NOT sentence starts or template headers
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: use ONLY these terms: headings, bullet_lists, numbered_lists, tables, code_blocks, citations, footnotes, images, forms, columns, sections
- noise_signals: use ONLY these terms: repeated_headers, page_numbers, formatting_artifacts, boilerplate, watermarks, footers, line_numbers, encoding_artifacts, ocr_errors
- extraction_priority: full if density=high and likely_has_named_entities=true; skip if density=low AND likely_has_named_entities=false AND content is under 200 words; partial otherwise

Document:
{content[:1500]}"""


def get_sample_documents():
    """Fetch up to SAMPLE_SIZE documents, one per distinct source.

    Returns:
        A list of dict-like rows with keys ``id``, ``document``, ``source``
        and ``created_at``.

    Raises:
        RuntimeError: If ``PG_DSN`` is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")

    conn = psycopg2.connect(PG_DSN)
    try:
        # The cursor context manager closes the cursor; the connection is
        # closed in ``finally`` so neither leaks if the query raises.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # NOTE(review): DISTINCT ON picks a random row *within* each
            # source, but LIMIT is applied after ORDER BY source, so the set
            # of sources is always the first SAMPLE_SIZE in sort order —
            # confirm whether a random set of sources was intended.
            cur.execute("""
                SELECT DISTINCT ON (source) id, document, source, created_at
                FROM embeddings
                WHERE length(document) > 100
                  AND length(document) < 3000
                ORDER BY source, random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        conn.close()


def run_briefing(prompt):
    """Send ``prompt`` to the model endpoint and parse the JSON it returns.

    Args:
        prompt: Full prompt text.

    Returns:
        ``(parsed, raw)`` on success, where ``parsed`` is the decoded JSON
        object and ``raw`` the model's stripped response text. On any
        failure returns ``(None, message)`` where ``message`` starts with
        one of ``NO_JSON``, ``NOT_DICT``, ``URL_ERROR``, ``TIMEOUT``,
        ``JSON_ERROR`` or ``ERROR``. This function does not raise.
    """
    payload = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    raw = ""
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()

        # The model may wrap the object in prose; take the outermost
        # brace-delimited span.
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:200]}"

        parsed = json.loads(raw[start:end])
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {raw[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        # Deliberate catch-all: one bad document must not abort the run;
        # the caller records the message and moves on.
        return None, f"ERROR: {type(e).__name__}: {e}"


def _filter_signals(value, allowed):
    """Return the terms of ``value`` that are in ``allowed``, de-duplicated in order."""
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, list):
        return []
    kept = []
    for item in value:
        term = str(item).lower().strip()
        # Repeats are dropped: the number of noise signals feeds the token
        # reduction estimate, so a duplicated term must not count twice.
        if term in allowed and term not in kept:
            kept.append(term)
    return kept


def sanitize_briefing(briefing, pre_classified_type=None):
    """Coerce a raw model briefing into the validated schema.

    Args:
        briefing: Dict decoded from the model's JSON response.
        pre_classified_type: Rule-assigned document type; when given it
            overrides whatever the model returned.

    Returns:
        A new dict containing only the known fields. Values outside the
        controlled vocabularies fall back to ``"unknown"`` (document type),
        ``"medium"`` (density), ``"partial"`` (extraction priority) or are
        dropped (structure/noise signals).
    """
    safe = {}

    if pre_classified_type:
        safe["document_type"] = pre_classified_type
    else:
        dt = str(briefing.get("document_type", "unknown")).lower().strip()
        safe["document_type"] = dt if dt in VALID_DOC_TYPES else "unknown"

    # ``or "en"`` also covers an explicit JSON null, which would otherwise
    # be stringified to "none".
    safe["primary_language"] = str(briefing.get("primary_language") or "en").lower().strip()[:10]

    density = str(briefing.get("density", "medium")).lower().strip()
    safe["density"] = density if density in VALID_DENSITIES else "medium"

    for field in ["has_proper_nouns", "has_dates", "has_numeric_data",
                  "has_institutional_language", "has_technical_terms",
                  "likely_has_named_entities"]:
        val = briefing.get(field, False)
        if isinstance(val, bool):
            safe[field] = val
        elif isinstance(val, str):
            # Models sometimes quote booleans; bool("false") would be True.
            safe[field] = val.lower() in ("true", "yes", "1")
        else:
            safe[field] = bool(val)

    for field, valid_set in [("structure_signals", VALID_STRUCTURE_SIGNALS),
                             ("noise_signals", VALID_NOISE_SIGNALS)]:
        safe[field] = _filter_signals(briefing.get(field, []), valid_set)

    priority = str(briefing.get("extraction_priority", "partial")).lower().strip()
    safe["extraction_priority"] = priority if priority in VALID_PRIORITIES else "partial"

    return safe


def estimate_token_reduction(original_text, briefing):
    """Estimate the token savings implied by a briefing.

    Tokens are approximated as ``len(text) / 4`` (minimum 1). Each noise
    signal is counted as a 5% reduction, capped at 40%, and a flat
    orientation saving is added on top. A ``skip`` priority counts as a
    100% reduction.

    Args:
        original_text: The text that would have been sent for extraction.
        briefing: Sanitized briefing dict.

    Returns:
        A dict of rounded estimates. The ``skip`` case carries a ``note``
        key and no ``noise_tokens_saved`` key.
    """
    original_tokens = max(len(original_text) / 4, 1)
    orientation_saved = _ORIENTATION_TOKENS

    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": round(original_tokens),
            "orientation_tokens_saved": round(original_tokens + orientation_saved),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call",
        }

    noise_count = len(briefing.get("noise_signals", []))
    noise_reduction_pct = min(noise_count * 0.05, 0.40)
    noise_tokens_saved = original_tokens * noise_reduction_pct
    total_saved = orientation_saved + noise_tokens_saved
    # Capped at 99% so only "skip" ever reports a full 100%.
    reduction_pct = min((total_saved / (original_tokens + orientation_saved)) * 100, 99.0)

    return {
        "original_tokens_approx": round(original_tokens),
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_tokens_saved),
        "noise_reduction_pct": round(noise_reduction_pct * 100, 1),
        "total_reduction_pct": round(reduction_pct, 1),
    }


def format_eta(elapsed_times, completed, total):
    """Return an ``ETA: H:MM:SS`` string from the average time per document.

    Args:
        elapsed_times: Seconds taken by each completed document.
        completed: Number of documents completed so far.
        total: Total number of documents.

    Returns:
        ``"ETA: --:--"`` before the first document completes, otherwise the
        remaining count times the average elapsed time.
    """
    if completed == 0:
        return "ETA: --:--"
    avg = sum(elapsed_times) / completed
    eta = timedelta(seconds=int((total - completed) * avg))
    return f"ETA: {eta}"


def content_hash(text):
    """Return the first 8 hex digits of the MD5 of ``text``.

    Used only as a short content fingerprint in the results file.
    """
    return hashlib.md5(text.encode()).hexdigest()[:8]


def _write_results(results):
    """Write the results structure to RESULTS_FILE as indented JSON."""
    with open(RESULTS_FILE, "w", encoding="utf-8") as f:
        # default=str stringifies values json cannot encode natively.
        json.dump(results, f, indent=2, default=str)


def main():
    """Run the briefing experiment over a document sample and report results.

    Loads the sample, asks the model for a briefing per document, sanitizes
    it, estimates token reduction, and writes per-document results plus a
    summary to RESULTS_FILE while printing progress to stdout.
    """
    test_start = time.time()

    print("\nBirdAI Briefing Generator v2 — Experiment 002b")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print("Changes: rule-based doc_type, template stripping, controlled vocab")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "version": "v2",
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None,
        },
        "documents": [],
        "summary": {},
    }

    success_count = 0
    failed_count = 0
    pre_classified_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # ``or`` rather than a .get() default: a NULL column is present in
        # the row with value None, which would break basename() and slicing.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)

        pre_type, cleaned_content = pre_classify_document(source, content)
        was_pre_classified = pre_type is not None
        if was_pre_classified:
            pre_classified_count += 1

        eta_str = format_eta(elapsed_times, i, len(docs))
        # R = classified by rule, M = left to the model.
        pre_flag = "R" if was_pre_classified else "M"
        print(f"[{i+1:02d}/{len(docs)}][{pre_flag}] {source[:36]:<36} {eta_str:<14}", end=" ", flush=True)

        prompt = build_briefing_prompt(cleaned_content, pre_type)
        t_start = time.time()
        briefing, raw = run_briefing(prompt)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            # On failure ``raw`` carries the error message.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id,
                "source": source,
                "content_hash": chash,
                "content_length": len(content),
                "status": "FAILED",
                "pre_classified_type": pre_type,
                "error": raw,
                "elapsed_seconds": elapsed,
            })
        else:
            briefing = sanitize_briefing(briefing, pre_type)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1

            reduction = estimate_token_reduction(cleaned_content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]

            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id,
                "source": source,
                "content_hash": chash,
                "content_length": len(content),
                "cleaned_content_length": len(cleaned_content),
                "status": "SUCCESS",
                "pre_classified_type": pre_type,
                "was_pre_classified": was_pre_classified,
                "elapsed_seconds": elapsed,
                "briefing": briefing,
                "token_reduction_estimate": reduction,
            })

        # Checkpoint after every document so an interrupted run keeps the
        # results gathered so far.
        _write_results(results)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()

    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # Guard the ratio: an empty sample must produce a summary, not a
    # ZeroDivisionError.
    success_ratio = success_count / total if total else 0.0

    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_ratio * 100, 1),
        "pre_classified_by_rule": pre_classified_count,
        "classified_by_model": total - pre_classified_count,
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_ratio >= 0.8,
    }
    results["summary"] = summary

    _write_results(results)

    print("\n" + "=" * 75)
    print("RESULTS — Briefing Generator v2")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Pre-classified (rule): {pre_classified_count}")
    print(f" Classified (model): {total - pre_classified_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)


if __name__ == "__main__":
    main()