scripts/: separate production from experimental and deprecated
Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2, base_class, cascade, cost_test, briefing, consistency, token series). Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py, tier1_migration.py — under the bespoke decision both target retired substrate work). Removes 19 .bak* files from disk (gitignored, never tracked; git history is the durable record of every prior version). The 11 production scripts remain in scripts/. All systemd ExecStart paths, api.py subprocess calls, and cron jobs continue to resolve correctly — verified by grep against /etc/systemd/system/aaronai-*.service, scripts/ references in api.py, and the user crontab. Track 1 inventory cross-cutting finding: scripts/ mixed 11 production files with 32 experimental scripts and ~20 .bak files. After this commit a clean-room reader can identify the live workers from a directory listing alone. Found by Track 1 inventory 2026-05-02. See ~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning. After commit, run: 1. git log --oneline -3 — show the new commit on top 2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
This commit is contained in:
@@ -0,0 +1,376 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BirdAI Briefing Generator v2 — Experiment 002b
|
||||
===============================================
|
||||
Changes from v1 (based on Experiment 004 human evaluation):
|
||||
- document_type now pre-classified by rule, not by model
|
||||
- Capture template header stripped before model sees content
|
||||
- noise_signals constrained to controlled vocabulary
|
||||
- Model prompt simplified — focuses only on reliable signal fields
|
||||
- Expanded document type vocabulary for BirdAI-specific types
|
||||
Results written to ~/aaronai/briefing_test_v2_results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import hashlib
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Populate the environment from the project .env file before reading PG_DSN.
load_dotenv(os.path.expanduser("~/aaronai/.env"))

# PostgreSQL connection string; None when the variable is absent
# (get_sample_documents raises in that case).
PG_DSN = os.getenv("PG_DSN")
# Path of the JSON file that main() writes results to.
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_v2_results.json")
# Model name sent in the request payload by run_briefing().
MODEL = "mistral"
# Row limit passed to the sampling query in get_sample_documents().
SAMPLE_SIZE = 50
# HTTP endpoint that run_briefing() posts prompts to.
OLLAMA_URL = "http://localhost:11434/api/generate"

# Document types accepted by sanitize_briefing(); anything else becomes "unknown".
VALID_DOC_TYPES = {
    "voice_capture", "image_capture",
    "dream_nrem", "dream_rem", "dream_lucid", "dream_synthesis",
    "presentation", "code", "spreadsheet",
    "academic_pdf", "technical_doc", "chat_log",
    "book_excerpt", "form", "syllabus", "email",
    "notes", "purchase_order", "annual_report",
    "invoice", "memo", "report", "unknown"
}

# Allowed values for the "density" and "extraction_priority" briefing fields.
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}

# Controlled vocabulary for "noise_signals" (same terms the prompt lists).
VALID_NOISE_SIGNALS = {
    "repeated_headers", "page_numbers", "formatting_artifacts",
    "boilerplate", "watermarks", "footers", "line_numbers",
    "encoding_artifacts", "ocr_errors"
}

# Controlled vocabulary for "structure_signals" (same terms the prompt lists).
VALID_STRUCTURE_SIGNALS = {
    "headings", "bullet_lists", "numbered_lists", "tables",
    "code_blocks", "citations", "footnotes", "images",
    "forms", "columns", "sections"
}
|
||||
|
||||
|
||||
def pre_classify_document(source, content):
    """Classify a document by rule and strip a capture/dream template header.

    Args:
        source: Path or name the document came from; only its basename is
            inspected. ``None`` is treated as "no filename".
        content: Document text. ``None`` is treated as an empty string.

    Returns:
        ``(doc_type, cleaned_content)``. ``doc_type`` is one of the rule-based
        type names, or ``None`` when no rule matched and the model should
        classify. ``cleaned_content`` is the text after the first ``---`` when
        the text before it looks like a template header, otherwise the
        content unchanged.
    """
    # A missing source (e.g. a NULL column) must not crash basename().
    filename = os.path.basename(source or "").lower()
    content = content or ""

    # Template header stripping: everything before the first "---" is dropped
    # only if it carries one of the known header markers and a body remains.
    cleaned_content = content
    header, separator, body = content.partition("---")
    if separator:
        header = header.lower()
        body = body.strip()
        header_markers = ("**type:**", "**modality:**", "# capture", "# dream")
        if body and any(marker in header for marker in header_markers):
            cleaned_content = body

    # Filename rules are checked first, in priority order; content-based
    # rules only apply when the filename gave no answer.
    doc_type = None
    if "nrem" in filename:
        doc_type = "dream_nrem"
    elif "lucid" in filename:
        doc_type = "dream_lucid"
    elif "-rem-" in filename or filename.endswith("-rem.md"):
        doc_type = "dream_rem"
    elif "synthesis" in filename and filename.endswith(".md"):
        doc_type = "dream_synthesis"
    elif "-voice" in filename or "voice-" in filename:
        doc_type = "voice_capture"
    elif "-image" in filename or "image-" in filename:
        doc_type = "image_capture"
    elif filename.endswith((".pptx", ".ppt")):
        doc_type = "presentation"
    elif filename.endswith((".xlsx", ".xls", ".csv")):
        doc_type = "spreadsheet"
    elif filename.endswith((".py", ".js", ".ts", ".cpp", ".c", ".h", ".java", ".rs")):
        doc_type = "code"
    elif filename.endswith("cmakelists.txt") or filename == "makefile":
        doc_type = "code"
    elif content.startswith("# Dream"):
        # The dream phase is looked for in the first 50 characters only.
        intro = content[:50].lower()
        if "nrem" in intro:
            doc_type = "dream_nrem"
        elif "lucid" in intro:
            doc_type = "dream_lucid"
        elif "rem" in intro:
            doc_type = "dream_rem"
        else:
            doc_type = "dream_synthesis"
    elif content.startswith("# Capture"):
        doc_type = "voice_capture" if "voice" in content[:100].lower() else "image_capture"

    return doc_type, cleaned_content
|
||||
|
||||
|
||||
def build_briefing_prompt(content, pre_classified_type=None):
    """Build the model prompt that requests a JSON briefing for *content*.

    When ``pre_classified_type`` is truthy it is written into the template as
    a fixed ``document_type``; otherwise the template lists the type names the
    model may choose from. Only the first 1500 characters of ``content`` are
    included in the prompt.
    """
    type_instruction = (
        f'\n "document_type": "{pre_classified_type}", // pre-classified, do not change'
        if pre_classified_type
        else '\n "document_type": "one of: academic_pdf, technical_doc, chat_log, book_excerpt, form, syllabus, email, notes, purchase_order, annual_report, invoice, memo, report, unknown",'
    )

    return f"""Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.

Return exactly this structure:
{{{type_instruction}
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}}

Rules:
- density: high=information dense technical or academic, medium=mixed, low=narrative/literary/sparse/short
- has_proper_nouns: true if you see capitalized words that are NOT sentence starts or template headers
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: use ONLY these terms: headings, bullet_lists, numbered_lists, tables, code_blocks, citations, footnotes, images, forms, columns, sections
- noise_signals: use ONLY these terms: repeated_headers, page_numbers, formatting_artifacts, boilerplate, watermarks, footers, line_numbers, encoding_artifacts, ocr_errors
- extraction_priority: full if density=high and likely_has_named_entities=true; skip if density=low AND likely_has_named_entities=false AND content is under 200 words; partial otherwise

Document:
{content[:1500]}"""
|
||||
|
||||
|
||||
def get_sample_documents():
    """Fetch a random sample of documents, at most one per distinct source.

    Returns:
        A list of dict-like rows with keys ``id``, ``document``, ``source``
        and ``created_at``; at most ``SAMPLE_SIZE`` rows, each 101-2999
        characters long.

    Raises:
        RuntimeError: if ``PG_DSN`` is not configured.
    """
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")

    conn = psycopg2.connect(PG_DSN)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # The inner query picks one random row per source. DISTINCT ON
            # forces ORDER BY source first, so applying LIMIT there would
            # always return the alphabetically first sources; the outer
            # query shuffles the per-source rows before limiting.
            cur.execute("""
                SELECT id, document, source, created_at
                FROM (
                    SELECT DISTINCT ON (source) id, document, source, created_at
                    FROM embeddings
                    WHERE length(document) > 100
                      AND length(document) < 3000
                    ORDER BY source, random()
                ) AS one_per_source
                ORDER BY random()
                LIMIT %s
            """, (SAMPLE_SIZE,))
            return cur.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()
|
||||
|
||||
|
||||
def run_briefing(prompt):
    """Send *prompt* to the model endpoint and extract the JSON briefing.

    Returns:
        ``(briefing, raw)`` on success, where ``briefing`` is the decoded
        dict and ``raw`` the stripped response text. On failure returns
        ``(None, message)`` with ``message`` starting with one of
        ``NO_JSON``, ``NOT_DICT``, ``URL_ERROR``, ``TIMEOUT``, ``JSON_ERROR``
        or ``ERROR``.
    """
    request_body = json.dumps({"model": MODEL, "prompt": prompt, "stream": False}).encode()
    raw = ""
    try:
        request = urllib.request.Request(
            OLLAMA_URL,
            data=request_body,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=180) as response:
            envelope = json.loads(response.read().decode())
        raw = envelope.get("response", "").strip()

        # The model may wrap the JSON in prose; keep only the span from the
        # first "{" to the last "}".
        first_brace = raw.find("{")
        last_brace = raw.rfind("}")
        if first_brace < 0 or last_brace < 0:
            return None, f"NO_JSON: {raw[:200]}"

        candidate = json.loads(raw[first_brace:last_brace + 1])
        if isinstance(candidate, dict):
            return candidate, raw
        return None, f"NOT_DICT: {raw[:100]}"
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        # Any other failure is reported to the caller instead of aborting
        # the whole run.
        return None, f"ERROR: {type(e).__name__}: {e}"
|
||||
|
||||
|
||||
def sanitize_briefing(briefing, pre_classified_type=None):
    """Coerce a raw model briefing into the validated schema.

    Args:
        briefing: Dict decoded from the model response.
        pre_classified_type: Rule-based document type. When truthy it
            overrides whatever the model returned.

    Returns:
        A new dict with ``document_type``, ``primary_language``, ``density``,
        six boolean ``has_*`` / ``likely_has_named_entities`` flags,
        ``structure_signals``, ``noise_signals`` and ``extraction_priority``.
        Values outside the ``VALID_*`` vocabularies fall back to defaults
        ("unknown", "medium", "partial") or are dropped from the lists.
    """
    def _token(key, default):
        # JSON null arrives as None; use the default rather than the text "none".
        value = briefing.get(key)
        return str(default if value is None else value).lower().strip()

    safe = {}

    if pre_classified_type:
        safe["document_type"] = pre_classified_type
    else:
        doc_type = _token("document_type", "unknown")
        safe["document_type"] = doc_type if doc_type in VALID_DOC_TYPES else "unknown"

    safe["primary_language"] = _token("primary_language", "en")[:10] or "en"

    density = _token("density", "medium")
    safe["density"] = density if density in VALID_DENSITIES else "medium"

    for field in ("has_proper_nouns", "has_dates", "has_numeric_data",
                  "has_institutional_language", "has_technical_terms",
                  "likely_has_named_entities"):
        value = briefing.get(field, False)
        if isinstance(value, str):
            # Models sometimes quote booleans; tolerate surrounding whitespace.
            safe[field] = value.strip().lower() in ("true", "yes", "1")
        else:
            safe[field] = bool(value)

    for field, vocabulary in (("structure_signals", VALID_STRUCTURE_SIGNALS),
                              ("noise_signals", VALID_NOISE_SIGNALS)):
        value = briefing.get(field, [])
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, list):
            value = []
        tokens = (str(item).lower().strip() for item in value)
        # De-duplicate while keeping first-seen order: a repeated signal adds
        # no information and would inflate counts derived from list length.
        safe[field] = list(dict.fromkeys(t for t in tokens if t in vocabulary))

    priority = _token("extraction_priority", "partial")
    safe["extraction_priority"] = priority if priority in VALID_PRIORITIES else "partial"

    return safe
|
||||
|
||||
|
||||
def estimate_token_reduction(original_text, briefing):
    """Estimate how many prompt tokens a briefing saves for one document.

    Tokens are approximated as ``len(original_text) / 4`` (minimum 1). A
    "skip" briefing counts as a 100% reduction. Otherwise the saving is a
    fixed 200 orientation tokens plus 5% of the document per noise signal
    (capped at 40%), with the total percentage capped at 99.0.

    Returns:
        A dict of rounded estimates; always contains ``total_reduction_pct``.
    """
    approx_tokens = max(len(original_text) / 4, 1)
    orientation_saved = 200

    if briefing.get("extraction_priority") == "skip":
        estimate = {"original_tokens_approx": round(approx_tokens)}
        estimate["orientation_tokens_saved"] = round(approx_tokens + 200)
        estimate["noise_reduction_pct"] = 100.0
        estimate["total_reduction_pct"] = 100.0
        estimate["note"] = "skip — no API call"
        return estimate

    signal_count = len(briefing.get("noise_signals", []))
    noise_fraction = min(signal_count * 0.05, 0.40)
    noise_saved = approx_tokens * noise_fraction
    combined_saved = orientation_saved + noise_saved
    overall_pct = min((combined_saved / (approx_tokens + 200)) * 100, 99.0)

    estimate = {"original_tokens_approx": round(approx_tokens)}
    estimate["orientation_tokens_saved"] = orientation_saved
    estimate["noise_tokens_saved"] = round(noise_saved)
    estimate["noise_reduction_pct"] = round(noise_fraction * 100, 1)
    estimate["total_reduction_pct"] = round(overall_pct, 1)
    return estimate
|
||||
|
||||
|
||||
def format_eta(elapsed_times, completed, total):
    """Return an ``"ETA: H:MM:SS"`` label for the remaining documents.

    The estimate is the mean of ``elapsed_times`` (sum divided by
    ``completed``) times the number of documents left. Before anything has
    completed the placeholder ``"ETA: --:--"`` is returned.
    """
    if completed != 0:
        mean_seconds = sum(elapsed_times) / completed
        remaining = timedelta(seconds=int((total - completed) * mean_seconds))
        return "ETA: " + str(remaining)
    return "ETA: --:--"
|
||||
|
||||
|
||||
def content_hash(text):
    """Return the first 8 hex characters of the MD5 digest of *text*."""
    digest = hashlib.md5(text.encode())
    return digest.hexdigest()[0:8]
|
||||
|
||||
|
||||
def main():
    """Run the briefing experiment end to end and write the results file.

    Loads the sample documents, pre-classifies each by rule, asks the model
    for a briefing, sanitizes it, estimates the token reduction, and records
    one entry per document. Results are written to ``RESULTS_FILE`` after
    every document and again with the summary at the end. An empty sample
    produces an all-zero summary.
    """
    def save_results(payload):
        # default=str stringifies any value json cannot encode natively.
        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, default=str)

    test_start = time.time()
    print("\nBirdAI Briefing Generator v2 — Experiment 002b")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print("Changes: rule-based doc_type, template stripping, controlled vocab")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")

    results = {
        "meta": {"model": MODEL, "version": "v2", "sample_size": len(docs),
                 "started": datetime.now().isoformat(), "completed": None,
                 "total_elapsed_seconds": None, "avg_seconds_per_doc": None},
        "documents": [], "summary": {}
    }

    success_count = 0
    failed_count = 0
    pre_classified_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        # A NULL source column comes back as None; the progress line slices it.
        source = doc.get("source") or "unknown"
        chash = content_hash(content)

        pre_type, cleaned_content = pre_classify_document(source, content)
        was_pre_classified = pre_type is not None
        if was_pre_classified:
            pre_classified_count += 1

        eta_str = format_eta(elapsed_times, i, len(docs))
        # "R" = type decided by rule, "M" = left to the model.
        pre_flag = "R" if was_pre_classified else "M"
        print(f"[{i+1:02d}/{len(docs)}][{pre_flag}] {source[:36]:<36} {eta_str:<14}", end=" ", flush=True)

        prompt = build_briefing_prompt(cleaned_content, pre_type)
        t_start = time.time()
        briefing, raw = run_briefing(prompt)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            # On failure run_briefing returns the error message as `raw`.
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "pre_classified_type": pre_type, "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing, pre_type)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(cleaned_content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "cleaned_content_length": len(cleaned_content),
                "status": "SUCCESS", "pre_classified_type": pre_type,
                "was_pre_classified": was_pre_classified, "elapsed_seconds": elapsed,
                "briefing": briefing, "token_reduction_estimate": reduction
            })

        # Checkpoint after every document so an interrupted run keeps its
        # partial results.
        save_results(results)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    total = len(docs)
    # Guard the ratios: an empty sample must not raise ZeroDivisionError.
    success_fraction = success_count / total if total else 0.0
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0
    summary = {
        "total": total, "success": success_count, "failed": failed_count,
        "success_rate": round(success_fraction * 100, 1),
        "pre_classified_by_rule": pre_classified_count,
        "classified_by_model": total - pre_classified_count,
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed, "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_fraction >= 0.8
    }
    results["summary"] = summary
    save_results(results)

    print("\n" + "=" * 75)
    print("RESULTS — Briefing Generator v2")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Pre-classified (rule): {pre_classified_count}")
    print(f" Classified (model): {total - pre_classified_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)
|
||||
|
||||
|
||||
# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user