experiments: add consistency test and briefing generator results + scripts

This commit is contained in:
2026-04-28 02:47:41 +00:00
parent 9937abbe27
commit b6fe350ab2
6 changed files with 6985 additions and 0 deletions
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+313
View File
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
BirdAI Briefing Generator Test
===============================
Tests the local LLM as a document briefing generator.
The local model produces a structured roadmap for the API —
cleaning, structure detection, signal flagging — without semantic judgment.
Results written to ~/aaronai/briefing_test_results.json
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
# Load DB credentials and settings from the project .env file.
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# PostgreSQL DSN for the pgvector-backed embeddings table (required).
PG_DSN = os.getenv("PG_DSN")
# JSON file receiving per-document results plus the final summary.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json")
# Ollama model used to generate briefings.
MODEL = "mistral"
# Number of distinct-source documents sampled from the database.
SAMPLE_SIZE = 50
# Local Ollama generate endpoint.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Allowed enum values for briefing fields; anything else is coerced to a
# safe default by sanitize_briefing().
VALID_DOC_TYPES = {
    "academic_pdf", "technical_doc", "chat_log", "code",
    "presentation", "book_excerpt", "form", "syllabus",
    "email", "notes", "unknown"
}
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}
# Instruction prompt sent to the local model; the (truncated) document
# text is appended after the trailing "Document:" line by run_briefing().
BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.
Return exactly this structure:
{
"document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown",
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}
Rules:
- document_type: identify from formatting patterns and vocabulary, not meaning
- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse
- has_proper_nouns: true if you see capitalized words that are not sentence starts
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"]
- noise_signals: list any noise patterns you see e.g. ["repeated_headers", "page_numbers", "formatting_artifacts", "boilerplate"]
- extraction_priority: full if density=high and likely_has_named_entities=true, skip if density=low and likely_has_named_entities=false, partial otherwise
Document:
"""
def get_sample_documents():
    """Fetch up to SAMPLE_SIZE random rows, one per distinct source.

    Returns a list of RealDictCursor rows with id, document, source and
    created_at. Raises RuntimeError when PG_DSN is missing so the failure
    is explicit instead of an opaque connection error.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    connection = psycopg2.connect(PG_DSN)
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    # Length bounds exclude tiny stubs and oversized chunks; DISTINCT ON
    # plus random() picks one random-ish row per source.
    cursor.execute(
        """
        SELECT DISTINCT ON (source) id, document, source, created_at
        FROM embeddings
        WHERE length(document) > 100
        AND length(document) < 3000
        ORDER BY source, random()
        LIMIT %s
        """,
        (SAMPLE_SIZE,),
    )
    rows = cursor.fetchall()
    cursor.close()
    connection.close()
    return rows
def run_briefing(text):
    """Ask the local Ollama model for a JSON briefing of *text*.

    Only the first 1500 characters of the document are sent. Returns
    (parsed_dict, raw_response) on success, or (None, error_string) on
    any failure (network error, timeout, unparseable/non-dict output).
    """
    request_body = json.dumps({
        "model": MODEL,
        "prompt": BRIEFING_PROMPT + text[:1500],
        "stream": False
    }).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as response:
            reply = json.loads(response.read().decode())
        raw = reply.get("response", "").strip()
        # Tolerate prose around the JSON: keep only the outermost {...} span.
        first_brace = raw.find("{")
        after_last_brace = raw.rfind("}") + 1
        if first_brace == -1 or after_last_brace == 0:
            return None, f"NO_JSON: {raw[:200]}"
        parsed = json.loads(raw[first_brace:after_last_brace])
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {raw[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
def sanitize_briefing(briefing):
    """Coerce a model-produced briefing dict into the expected schema.

    Unknown enum values fall back to safe defaults ("unknown", "medium",
    "partial"), booleans are coerced from strings or truthiness, and the
    two signal fields are forced to lists of non-empty strings.
    """
    clean = {}

    doc_type = str(briefing.get("document_type", "unknown")).lower().strip()
    if doc_type not in VALID_DOC_TYPES:
        doc_type = "unknown"
    clean["document_type"] = doc_type

    # Language codes are short; cap at 10 chars in case the model rambles.
    clean["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]

    density = str(briefing.get("density", "medium")).lower().strip()
    if density not in VALID_DENSITIES:
        density = "medium"
    clean["density"] = density

    boolean_fields = (
        "has_proper_nouns", "has_dates", "has_numeric_data",
        "has_institutional_language", "has_technical_terms",
        "likely_has_named_entities",
    )
    for name in boolean_fields:
        value = briefing.get(name, False)
        if isinstance(value, bool):
            clean[name] = value
        elif isinstance(value, str):
            # Accept common textual spellings of "true".
            clean[name] = value.lower() in ("true", "yes", "1")
        else:
            clean[name] = bool(value)

    for name in ("structure_signals", "noise_signals"):
        value = briefing.get(name, [])
        if isinstance(value, list):
            clean[name] = [str(item) for item in value if item]
        elif isinstance(value, str):
            clean[name] = [value] if value else []
        else:
            clean[name] = []

    priority = str(briefing.get("extraction_priority", "partial")).lower().strip()
    if priority not in VALID_PRIORITIES:
        priority = "partial"
    clean["extraction_priority"] = priority
    return clean
def estimate_token_reduction(original_text, briefing):
    """Rough estimate of API tokens saved by acting on this briefing.

    Uses the ~4 characters/token heuristic. A "skip" priority saves the
    whole document plus the fixed orientation overhead (no API call at
    all); otherwise savings are the orientation overhead plus 5% of the
    document per detected noise signal, capped at 40% of the document.

    Returns a dict of rounded token/percentage figures for the results
    file. Fix over original: the orientation overhead (200 tokens) was
    defined as a constant but then repeated as a magic literal in two
    expressions — now the constant is used everywhere (behavior unchanged).
    """
    # ~4 chars per token; floor of 1 keeps the division below well-defined.
    original_tokens = max(len(original_text) / 4, 1)
    # Fixed prompt/orientation overhead the briefing lets the API skip.
    orientation_saved = 200
    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": round(original_tokens),
            "orientation_tokens_saved": round(original_tokens + orientation_saved),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call"
        }
    noise_count = len(briefing.get("noise_signals", []))
    # 5% of the document per noise signal, capped at 40%.
    noise_reduction_pct = min(noise_count * 0.05, 0.40)
    noise_tokens_saved = original_tokens * noise_reduction_pct
    total_saved = orientation_saved + noise_tokens_saved
    # What the API would otherwise pay: document + orientation overhead.
    total_cost = original_tokens + orientation_saved
    reduction_pct = min((total_saved / total_cost) * 100, 99.0)
    return {
        "original_tokens_approx": round(original_tokens),
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_tokens_saved),
        "noise_reduction_pct": round(noise_reduction_pct * 100, 1),
        "total_reduction_pct": round(reduction_pct, 1)
    }
def format_eta(elapsed_times, completed, total):
    """Format a wall-clock ETA string from per-item elapsed times.

    Returns "ETA: --:--" before any item has finished; otherwise
    extrapolates the mean per-item time over the remaining items.
    """
    if completed == 0:
        return "ETA: --:--"
    mean_elapsed = sum(elapsed_times) / completed
    seconds_left = int((total - completed) * mean_elapsed)
    return f"ETA: {timedelta(seconds=seconds_left)}"
def content_hash(text):
    """Short (8 hex chars) MD5 fingerprint of *text* for labelling results."""
    digest = hashlib.md5(text.encode()).hexdigest()
    return digest[:8]
def main():
    """Run the briefing test end-to-end and write results to RESULTS_FILE.

    Samples documents, asks the local model for a briefing per document,
    sanitizes each briefing, estimates token savings, and prints a running
    log plus a final summary. The results file is rewritten after every
    document (checkpointing) and once more at the end with meta/summary
    filled in.
    """
    test_start = time.time()
    print(f"\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")
    # Skeleton of the results file; meta/summary are completed after the loop.
    results = {
        "meta": {
            "model": MODEL,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }
    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        eta_str = format_eta(elapsed_times, i, len(docs))
        print(f"[{i+1:02d}/{len(docs)}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)
        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)
        if briefing is None:
            # Model produced no parseable JSON; record the error string verbatim.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"{priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })
        # Checkpoint after every document so a crash loses at most one result.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc
    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    # NOTE(review): success_rate divides by total with no guard — raises
    # ZeroDivisionError if the sample query returns no documents; confirm
    # that failing loudly on an empty sample is the intended behavior.
    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_count / total * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_count / total >= 0.8
    }
    results["summary"] = summary
    # Final write includes the completed meta block and summary.
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 75)
    print(f"RESULTS")
    print(f"  Success rate:        {success_count}/{total} ({summary['success_rate']}%)")
    print(f"  Failed:              {failed_count}")
    print(f"  Priority — full:     {priority_counts.get('full', 0)}")
    print(f"  Priority — partial:  {priority_counts.get('partial', 0)}")
    print(f"  Priority — skip:     {priority_counts.get('skip', 0)}")
    print(f"  Avg token reduction: {avg_reduction}%")
    print(f"  Total elapsed:       {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f"  Avg per document:    {avg_per_doc}s")
    print(f"  Projected 50 docs:   {summary['projected_50_doc_minutes']} min")
    print(f"  Approach viable:     {'YES' if summary['approach_viable'] else 'NO'}")
    print(f"  Completed:           {completed_at}")
    print(f"  Full results:        {RESULTS_FILE}")
    print("=" * 75)
if __name__ == "__main__":
    main()
+248
View File
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
BirdAI Cascaded Extraction — Consistency Test
"""
import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime
from dotenv import load_dotenv
# Load DB credentials and settings from the project .env file.
load_dotenv(os.path.expanduser("~/aaronai/.env"))
# PostgreSQL DSN for the pgvector-backed embeddings table (required).
PG_DSN = os.getenv("PG_DSN")
# JSON file receiving per-document results plus the final summary.
RESULTS_FILE = os.path.expanduser("~/aaronai/consistency_test_results.json")
# Ollama model under test.
MODEL = "mistral"
# How many times each document is extracted to measure determinism.
PASSES = 3
# Number of documents sampled from the database.
SAMPLE_SIZE = 50
# Local Ollama generate endpoint.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Instruction prompt sent to the local model; the (truncated) document
# text is appended after the trailing "Text: " by run_extraction().
EXTRACTION_PROMPT = """Extract named entities from this text. Return JSON only, no explanation, no prose.
Use exactly these fields (omit any field you are uncertain about, use empty list if none found):
{
"people": [],
"organizations": [],
"locations": [],
"dates": [],
"document_type": ""
}
Rules:
- Every value in people, organizations, locations, dates must be a plain string
- document_type must be a plain string
- No nested objects, no nested lists
- Only include entities you are certain about
- If uncertain about anything, omit it
Text: """
def get_sample_documents():
    """Fetch SAMPLE_SIZE random mid-sized documents from the embeddings table.

    Returns a list of RealDictCursor rows with id, document, source and
    created_at.

    Raises:
        RuntimeError: if PG_DSN is missing from the environment — the
            sibling briefing script guards this explicitly, and without
            the guard psycopg2.connect(None) fails with an opaque error.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    # Length bounds exclude tiny stubs (<100 chars) and oversized chunks (>3000).
    cur.execute("""
        SELECT id, document, source, created_at
        FROM embeddings
        WHERE length(document) > 100
        AND length(document) < 3000
        ORDER BY random()
        LIMIT %s
    """, (SAMPLE_SIZE,))
    docs = cur.fetchall()
    cur.close()
    conn.close()
    return docs
def run_extraction(text):
    """Ask the local Ollama model to extract named entities from *text*.

    Only the first 1500 characters are sent. Returns (parsed_dict,
    raw_response) on success, or (None, error_string) on any failure.

    Fixes over original (matching run_briefing in the sibling script):
    *raw* is initialized before the try block and the JSON_ERROR message
    includes a snippet of the raw model output for debugging.
    """
    prompt = EXTRACTION_PROMPT + text[:1500]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    raw = ""
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        # Tolerate prose around the JSON: keep only the outermost {...} span.
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:100]}"
        json_str = raw[start:end]
        parsed = json.loads(json_str)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {json_str[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"
def flatten_value(v):
    """Normalize an extracted value to a canonical lowercase string.

    Dicts become sorted-key JSON, lists become JSON of their sorted
    flattened elements, and everything else is stringified — all
    lowercased so pass-to-pass comparisons ignore case and ordering.
    """
    if isinstance(v, dict):
        return json.dumps(v, sort_keys=True).lower()
    if isinstance(v, list):
        flattened = sorted(flatten_value(item) for item in v)
        return json.dumps(flattened)
    if isinstance(v, str):
        return v.lower().strip()
    return str(v).lower().strip()
def normalize_extraction(extracted):
    """Return a canonical form of an extraction dict for comparison.

    Missing fields default to [] ("" for document_type); list fields are
    flattened and sorted so ordering differences between passes do not
    count as inconsistency. Returns None for a failed (None) extraction.
    """
    if extracted is None:
        return None
    canonical = {}
    for field in ("people", "organizations", "locations", "dates", "document_type"):
        default = "" if field == "document_type" else []
        value = extracted.get(field, default)
        if isinstance(value, list):
            canonical[field] = sorted(flatten_value(item) for item in value)
        else:
            canonical[field] = flatten_value(value)
    return canonical
def extractions_consistent(extractions):
    """True when every pass succeeded and all normalized outputs match.

    Any None (failed pass) — before or after normalization — makes the
    result False; an empty input is vacuously consistent.
    """
    for extraction in extractions:
        if extraction is None:
            return False
    normalized = [normalize_extraction(e) for e in extractions]
    if any(item is None for item in normalized):
        return False
    # Lazily compare against the first element; vacuously True when empty.
    return all(candidate == normalized[0] for candidate in normalized[1:])
def content_hash(text):
    """8-character MD5 prefix used to label documents in logs and results."""
    hasher = hashlib.md5()
    hasher.update(text.encode())
    return hasher.hexdigest()[:8]
def main():
    """Run the consistency test: PASSES extractions per doc, compare outputs.

    A document counts as CONSISTENT only when every pass parses and all
    normalized extractions are identical. TIMEOUT takes precedence over
    FAILED in the per-document status. Results are checkpointed to
    RESULTS_FILE after every document and finalized with a summary.
    """
    print(f"\nBirdAI Consistency Test")
    print(f"Model: {MODEL} | Passes: {PASSES} | Sample: {SAMPLE_SIZE} docs")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 60)
    docs = get_sample_documents()
    print(f"Loaded {len(docs)} documents from pgvector\n")
    # Skeleton of the results file; meta/summary are completed after the loop.
    results = {
        "meta": {
            "model": MODEL,
            "passes": PASSES,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None
        },
        "documents": [],
        "summary": {}
    }
    consistent_count = 0
    failed_count = 0
    timeout_count = 0
    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        print(f"[{i+1:02d}/{len(docs)}] {source[:50]:<50} hash:{chash}", end=" ", flush=True)
        passes = []
        pass_times = []
        raw_outputs = []
        # Run the identical extraction PASSES times to measure determinism.
        for p in range(PASSES):
            t_start = time.time()
            extracted, raw = run_extraction(content)
            t_end = time.time()
            passes.append(extracted)
            pass_times.append(round(t_end - t_start, 1))
            raw_outputs.append(raw[:200] if raw else "")
        consistent = extractions_consistent(passes)
        # Timeouts are detected via the error string run_extraction returns.
        any_timeout = any("TIMEOUT" in str(r) for r in raw_outputs)
        any_failed = any(p is None for p in passes)
        if any_timeout:
            timeout_count += 1
            status = "TIMEOUT"
        elif any_failed:
            failed_count += 1
            status = "FAILED"
        elif consistent:
            consistent_count += 1
            status = "CONSISTENT"
        else:
            status = "INCONSISTENT"
        print(f"{status} ({'/'.join(str(t) for t in pass_times)}s)")
        # Keep one normalized sample for inspection; tolerate any failure.
        try:
            sample_extraction = normalize_extraction(passes[0]) if passes[0] else None
        except Exception:
            sample_extraction = None
        results["documents"].append({
            "id": doc_id,
            "source": source,
            "content_hash": chash,
            "content_length": len(content),
            "status": status,
            "consistent": consistent,
            "pass_times_seconds": pass_times,
            "extraction_sample": sample_extraction,
            "raw_samples": raw_outputs
        })
        # Checkpoint after every document so a crash loses at most one result.
        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)
    total = len(docs)
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    # NOTE(review): consistency_rate divides by total with no guard —
    # raises ZeroDivisionError if the DB returned no documents; confirm
    # that failing loudly on an empty sample is the intended behavior.
    summary = {
        "total": total,
        "consistent": consistent_count,
        "inconsistent": total - consistent_count - failed_count - timeout_count,
        "failed": failed_count,
        "timeout": timeout_count,
        "consistency_rate": round(consistent_count / total * 100, 1),
        "cascade_viable": consistent_count / total >= 0.5
    }
    results["summary"] = summary
    # Final write includes the completed meta block and summary.
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)
    print("\n" + "=" * 60)
    print(f"RESULTS")
    print(f"  Consistent:     {consistent_count}/{total} ({summary['consistency_rate']}%)")
    print(f"  Inconsistent:   {summary['inconsistent']}")
    print(f"  Failed/Timeout: {failed_count + timeout_count}")
    print(f"  Cascade viable: {'YES' if summary['cascade_viable'] else 'NO — reconsider architecture'}")
    print(f"  Completed:      {completed_at}")
    print(f"  Full results:   {RESULTS_FILE}")
    print("=" * 60)
if __name__ == "__main__":
    main()