experiments: add consistency test and briefing generator results + scripts
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
BirdAI Briefing Generator Test
===============================
Tests the local LLM as a document briefing generator.
The local model produces a structured roadmap for the API —
cleaning, structure detection, signal flagging — without semantic judgment.
Results written to ~/aaronai/briefing_test_results.json
"""

import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv

load_dotenv(os.path.expanduser("~/aaronai/.env"))

PG_DSN = os.getenv("PG_DSN")
RESULTS_FILE = os.path.expanduser("~/aaronai/briefing_test_results.json")
MODEL = "mistral"
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"

VALID_DOC_TYPES = {
    "academic_pdf", "technical_doc", "chat_log", "code",
    "presentation", "book_excerpt", "form", "syllabus",
    "email", "notes", "unknown"
}
VALID_DENSITIES = {"high", "medium", "low"}
VALID_PRIORITIES = {"full", "partial", "skip"}

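# Briefing prompt: asks the local model for a purely structural JSON briefing
# (document type, language, density, signal flags, extraction priority) with
# no semantic summarization.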
BRIEFING_PROMPT = """Analyze this document and return a JSON briefing. No explanation, no prose, JSON only.

Return exactly this structure:
{
"document_type": "one of: academic_pdf, technical_doc, chat_log, code, presentation, book_excerpt, form, syllabus, email, notes, unknown",
"primary_language": "language code e.g. en, fr, de",
"density": "one of: high, medium, low",
"has_proper_nouns": true or false,
"has_dates": true or false,
"has_numeric_data": true or false,
"has_institutional_language": true or false,
"has_technical_terms": true or false,
"likely_has_named_entities": true or false,
"structure_signals": [],
"noise_signals": [],
"extraction_priority": "one of: full, partial, skip"
}

Rules:
- document_type: identify from formatting patterns and vocabulary, not meaning
- density: high=information dense technical or academic text, medium=mixed, low=narrative/literary/sparse
- has_proper_nouns: true if you see capitalized words that are not sentence starts
- has_dates: true if you see date patterns (numbers with months, years, slashes)
- has_numeric_data: true if you see measurements, percentages, statistics
- has_institutional_language: true if you see words like university, department, policy, committee, grant
- has_technical_terms: true if you see domain-specific jargon or acronyms
- likely_has_named_entities: true if has_proper_nouns is true
- structure_signals: list any structural markers you see e.g. ["headings", "bullet_lists", "numbered_lists", "tables", "code_blocks", "citations"]
- noise_signals: list any noise patterns you see e.g. ["repeated_headers", "page_numbers", "formatting_artifacts", "boilerplate"]
- extraction_priority: full if density=high and likely_has_named_entities=true, skip if density=low and likely_has_named_entities=false, partial otherwise

Document:
"""

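# Sample up to SAMPLE_SIZE chunks from the pgvector `embeddings` table:
# one randomly chosen chunk per distinct source, between 100 and 3000 characters long.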
def get_sample_documents():
    if not PG_DSN:
        raise RuntimeError("PG_DSN not found in .env — cannot connect to database")
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    cur.execute("""
        SELECT DISTINCT ON (source) id, document, source, created_at
        FROM embeddings
        WHERE length(document) > 100
          AND length(document) < 3000
        ORDER BY source, random()
        LIMIT %s
    """, (SAMPLE_SIZE,))
    docs = cur.fetchall()
    cur.close()
    conn.close()
    return docs

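# Send the briefing prompt plus the first 1500 characters of the chunk to the
# local Ollama server, then parse the first {...} span in the response.
# Returns (parsed_dict, raw_response) on success, (None, error_string) otherwise.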
def run_briefing(text):
    prompt = BRIEFING_PROMPT + text[:1500]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    raw = ""
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:200]}"
        json_str = raw[start:end]
        parsed = json.loads(json_str)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {raw[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e} | raw: {raw[:200]}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"

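# Coerce a raw model briefing into the expected schema: invalid document types,
# densities, and priorities fall back to safe defaults; boolean flags and the
# signal lists are normalized to proper Python types.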
def sanitize_briefing(briefing):
    safe = {}
    dt = str(briefing.get("document_type", "unknown")).lower().strip()
    safe["document_type"] = dt if dt in VALID_DOC_TYPES else "unknown"
    safe["primary_language"] = str(briefing.get("primary_language", "en")).lower().strip()[:10]
    density = str(briefing.get("density", "medium")).lower().strip()
    safe["density"] = density if density in VALID_DENSITIES else "medium"
    for field in ["has_proper_nouns", "has_dates", "has_numeric_data",
                  "has_institutional_language", "has_technical_terms",
                  "likely_has_named_entities"]:
        val = briefing.get(field, False)
        if isinstance(val, bool):
            safe[field] = val
        elif isinstance(val, str):
            safe[field] = val.lower() in ("true", "yes", "1")
        else:
            safe[field] = bool(val)
    for field in ["structure_signals", "noise_signals"]:
        val = briefing.get(field, [])
        if isinstance(val, list):
            safe[field] = [str(v) for v in val if v]
        elif isinstance(val, str):
            safe[field] = [val] if val else []
        else:
            safe[field] = []
    priority = str(briefing.get("extraction_priority", "partial")).lower().strip()
    safe["extraction_priority"] = priority if priority in VALID_PRIORITIES else "partial"
    return safe

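# Heuristic token-savings estimate: ~4 characters per token, a flat 200 tokens
# of orientation overhead saved per document, plus 5% of the document per
# detected noise signal (capped at 40%). Documents marked "skip" count as 100%.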
def estimate_token_reduction(original_text, briefing):
    original_tokens = max(len(original_text) / 4, 1)
    orientation_saved = 200
    if briefing.get("extraction_priority") == "skip":
        return {
            "original_tokens_approx": round(original_tokens),
            "orientation_tokens_saved": round(original_tokens + 200),
            "noise_reduction_pct": 100.0,
            "total_reduction_pct": 100.0,
            "note": "skip — no API call"
        }
    noise_count = len(briefing.get("noise_signals", []))
    noise_reduction_pct = min(noise_count * 0.05, 0.40)
    noise_tokens_saved = original_tokens * noise_reduction_pct
    total_saved = orientation_saved + noise_tokens_saved
    total_cost = original_tokens + 200
    reduction_pct = min((total_saved / total_cost) * 100, 99.0)
    return {
        "original_tokens_approx": round(original_tokens),
        "orientation_tokens_saved": orientation_saved,
        "noise_tokens_saved": round(noise_tokens_saved),
        "noise_reduction_pct": round(noise_reduction_pct * 100, 1),
        "total_reduction_pct": round(reduction_pct, 1)
    }

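# Progress helpers: rolling ETA from the average per-document time so far, and
# a short md5 prefix used as a stable content fingerprint in the results file.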
def format_eta(elapsed_times, completed, total):
    if completed == 0:
        return "ETA: --:--"
    avg = sum(elapsed_times) / completed
    remaining = (total - completed) * avg
    eta = timedelta(seconds=int(remaining))
    return f"ETA: {str(eta)}"


def content_hash(text):
    return hashlib.md5(text.encode()).hexdigest()[:8]

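# Main driver: brief each sampled document once, track timing and estimated
# token savings, write results to disk as the run progresses, and finish with
# a summary (success rate, priority breakdown, viability flag).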
def main():
    test_start = time.time()
    print(f"\nBirdAI Briefing Generator Test")
    print(f"Model: {MODEL} | Sample: {SAMPLE_SIZE} docs (distinct sources)")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 75)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} distinct source documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None,
            "total_elapsed_seconds": None,
            "avg_seconds_per_doc": None
        },
        "documents": [],
        "summary": {}
    }

    success_count = 0
    failed_count = 0
    priority_counts = {"full": 0, "partial": 0, "skip": 0}
    total_reduction_pct = 0.0
    elapsed_times = []

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)
        eta_str = format_eta(elapsed_times, i, len(docs))
        print(f"[{i+1:02d}/{len(docs)}] {source[:38]:<38} {eta_str:<14}", end=" ", flush=True)

        t_start = time.time()
        briefing, raw = run_briefing(content)
        elapsed = round(time.time() - t_start, 1)
        elapsed_times.append(elapsed)

        if briefing is None:
            failed_count += 1
            print(f"→ FAILED {elapsed}s | {raw[:50]}")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "FAILED",
                "error": raw, "elapsed_seconds": elapsed
            })
        else:
            briefing = sanitize_briefing(briefing)
            success_count += 1
            priority = briefing["extraction_priority"]
            doc_type = briefing["document_type"]
            density = briefing["density"]
            priority_counts[priority] = priority_counts.get(priority, 0) + 1
            reduction = estimate_token_reduction(content, briefing)
            total_reduction_pct += reduction["total_reduction_pct"]
            print(f"→ {priority.upper():<7} {doc_type:<15} density:{density:<6} -{reduction['total_reduction_pct']:>5.1f}% {elapsed}s")
            results["documents"].append({
                "id": doc_id, "source": source, "content_hash": chash,
                "content_length": len(content), "status": "SUCCESS",
                "elapsed_seconds": elapsed, "briefing": briefing,
                "token_reduction_estimate": reduction
            })

        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total_elapsed = round(time.time() - test_start, 1)
    avg_per_doc = round(total_elapsed / len(docs), 1) if docs else 0
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at
    results["meta"]["total_elapsed_seconds"] = total_elapsed
    results["meta"]["avg_seconds_per_doc"] = avg_per_doc

    total = len(docs)
    avg_reduction = round(total_reduction_pct / success_count, 1) if success_count else 0

    summary = {
        "total": total,
        "success": success_count,
        "failed": failed_count,
        "success_rate": round(success_count / total * 100, 1),
        "extraction_priority_breakdown": priority_counts,
        "avg_token_reduction_pct": avg_reduction,
        "total_elapsed_seconds": total_elapsed,
        "avg_seconds_per_doc": avg_per_doc,
        "projected_50_doc_minutes": round((avg_per_doc * 50) / 60, 1),
        "approach_viable": success_count / total >= 0.8
    }
    results["summary"] = summary

    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 75)
    print(f"RESULTS")
    print(f" Success rate: {success_count}/{total} ({summary['success_rate']}%)")
    print(f" Failed: {failed_count}")
    print(f" Priority — full: {priority_counts.get('full', 0)}")
    print(f" Priority — partial: {priority_counts.get('partial', 0)}")
    print(f" Priority — skip: {priority_counts.get('skip', 0)}")
    print(f" Avg token reduction: {avg_reduction}%")
    print(f" Total elapsed: {total_elapsed}s ({round(total_elapsed/60, 1)} min)")
    print(f" Avg per document: {avg_per_doc}s")
    print(f" Projected 50 docs: {summary['projected_50_doc_minutes']} min")
    print(f" Approach viable: {'YES' if summary['approach_viable'] else 'NO'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 75)


if __name__ == "__main__":
    main()
@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
BirdAI Cascaded Extraction — Consistency Test
"""

import json
import os
import urllib.request
import urllib.error
import psycopg2
import psycopg2.extras
import hashlib
import time
from datetime import datetime
from dotenv import load_dotenv

load_dotenv(os.path.expanduser("~/aaronai/.env"))

PG_DSN = os.getenv("PG_DSN")
RESULTS_FILE = os.path.expanduser("~/aaronai/consistency_test_results.json")
MODEL = "mistral"
PASSES = 3
SAMPLE_SIZE = 50
OLLAMA_URL = "http://localhost:11434/api/generate"

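# Extraction prompt: flat string lists of people, organizations, locations,
# and dates plus a document_type string; the model is told to omit anything
# it is uncertain about.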
EXTRACTION_PROMPT = """Extract named entities from this text. Return JSON only, no explanation, no prose.
Use exactly these fields (omit any field you are uncertain about, use empty list if none found):
{
"people": [],
"organizations": [],
"locations": [],
"dates": [],
"document_type": ""
}
Rules:
- Every value in people, organizations, locations, dates must be a plain string
- document_type must be a plain string
- No nested objects, no nested lists
- Only include entities you are certain about
- If uncertain about anything, omit it
Text: """

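# Sample SAMPLE_SIZE random chunks (between 100 and 3000 characters) from the
# pgvector `embeddings` table; unlike the briefing test, sources may repeat here.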
def get_sample_documents():
    conn = psycopg2.connect(PG_DSN)
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    cur.execute("""
        SELECT id, document, source, created_at
        FROM embeddings
        WHERE length(document) > 100
          AND length(document) < 3000
        ORDER BY random()
        LIMIT %s
    """, (SAMPLE_SIZE,))
    docs = cur.fetchall()
    cur.close()
    conn.close()
    return docs

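# One extraction pass: send the prompt plus the first 1500 characters to the
# local Ollama server and parse the first {...} span in the response.
# Returns (parsed_dict, raw_response) on success, (None, error_string) otherwise.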
def run_extraction(text):
    prompt = EXTRACTION_PROMPT + text[:1500]
    payload = json.dumps({
        "model": MODEL,
        "prompt": prompt,
        "stream": False
    }).encode()
    try:
        req = urllib.request.Request(
            OLLAMA_URL,
            data=payload,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            result = json.loads(resp.read().decode())
        raw = result.get("response", "").strip()
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start == -1 or end == 0:
            return None, f"NO_JSON: {raw[:100]}"
        json_str = raw[start:end]
        parsed = json.loads(json_str)
        if not isinstance(parsed, dict):
            return None, f"NOT_DICT: {json_str[:100]}"
        return parsed, raw
    except urllib.error.URLError as e:
        return None, f"URL_ERROR: {e}"
    except TimeoutError:
        return None, "TIMEOUT"
    except json.JSONDecodeError as e:
        return None, f"JSON_ERROR: {e}"
    except Exception as e:
        return None, f"ERROR: {type(e).__name__}: {e}"

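# Flatten any extracted value to a comparable lowercase string; dicts and lists
# are serialized deterministically so ordering differences do not matter.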
def flatten_value(v):
    if isinstance(v, str):
        return v.lower().strip()
    elif isinstance(v, dict):
        return json.dumps(v, sort_keys=True).lower()
    elif isinstance(v, list):
        return json.dumps(sorted([flatten_value(i) for i in v]))
    else:
        return str(v).lower().strip()

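# Normalize one extraction into a canonical dict over the expected fields,
# with every entity list lowercased and sorted.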
def normalize_extraction(extracted):
    if extracted is None:
        return None
    normalized = {}
    expected_fields = ["people", "organizations", "locations", "dates", "document_type"]
    for key in expected_fields:
        val = extracted.get(key, [] if key != "document_type" else "")
        if isinstance(val, list):
            normalized[key] = sorted([flatten_value(v) for v in val])
        else:
            normalized[key] = flatten_value(val)
    return normalized

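# A document counts as consistent only if every pass parsed successfully and
# all normalized extractions are identical.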
def extractions_consistent(extractions):
    if any(e is None for e in extractions):
        return False
    normalized = [normalize_extraction(e) for e in extractions]
    if any(n is None for n in normalized):
        return False
    return all(n == normalized[0] for n in normalized[1:])


def content_hash(text):
    return hashlib.md5(text.encode()).hexdigest()[:8]

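# Main driver: run PASSES extraction passes per document, classify each as
# CONSISTENT / INCONSISTENT / FAILED / TIMEOUT, write results to disk as the
# run progresses, and report an overall consistency rate and viability flag.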
def main():
    print(f"\nBirdAI Consistency Test")
    print(f"Model: {MODEL} | Passes: {PASSES} | Sample: {SAMPLE_SIZE} docs")
    print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Results: {RESULTS_FILE}")
    print("-" * 60)

    docs = get_sample_documents()
    print(f"Loaded {len(docs)} documents from pgvector\n")

    results = {
        "meta": {
            "model": MODEL,
            "passes": PASSES,
            "sample_size": len(docs),
            "started": datetime.now().isoformat(),
            "completed": None
        },
        "documents": [],
        "summary": {}
    }

    consistent_count = 0
    failed_count = 0
    timeout_count = 0

    for i, doc in enumerate(docs):
        doc_id = doc["id"]
        content = doc["document"]
        source = doc.get("source", "unknown")
        chash = content_hash(content)

        print(f"[{i+1:02d}/{len(docs)}] {source[:50]:<50} hash:{chash}", end=" ", flush=True)

        passes = []
        pass_times = []
        raw_outputs = []

        for p in range(PASSES):
            t_start = time.time()
            extracted, raw = run_extraction(content)
            t_end = time.time()
            passes.append(extracted)
            pass_times.append(round(t_end - t_start, 1))
            raw_outputs.append(raw[:200] if raw else "")

        consistent = extractions_consistent(passes)
        any_timeout = any("TIMEOUT" in str(r) for r in raw_outputs)
        any_failed = any(p is None for p in passes)

        if any_timeout:
            timeout_count += 1
            status = "TIMEOUT"
        elif any_failed:
            failed_count += 1
            status = "FAILED"
        elif consistent:
            consistent_count += 1
            status = "CONSISTENT"
        else:
            status = "INCONSISTENT"

        print(f"→ {status} ({'/'.join(str(t) for t in pass_times)}s)")

        try:
            sample_extraction = normalize_extraction(passes[0]) if passes[0] else None
        except Exception:
            sample_extraction = None

        results["documents"].append({
            "id": doc_id,
            "source": source,
            "content_hash": chash,
            "content_length": len(content),
            "status": status,
            "consistent": consistent,
            "pass_times_seconds": pass_times,
            "extraction_sample": sample_extraction,
            "raw_samples": raw_outputs
        })

        with open(RESULTS_FILE, "w") as f:
            json.dump(results, f, indent=2, default=str)

    total = len(docs)
    completed_at = datetime.now().isoformat()
    results["meta"]["completed"] = completed_at

    summary = {
        "total": total,
        "consistent": consistent_count,
        "inconsistent": total - consistent_count - failed_count - timeout_count,
        "failed": failed_count,
        "timeout": timeout_count,
        "consistency_rate": round(consistent_count / total * 100, 1),
        "cascade_viable": consistent_count / total >= 0.5
    }
    results["summary"] = summary

    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2, default=str)

    print("\n" + "=" * 60)
    print(f"RESULTS")
    print(f" Consistent: {consistent_count}/{total} ({summary['consistency_rate']}%)")
    print(f" Inconsistent: {summary['inconsistent']}")
    print(f" Failed/Timeout: {failed_count + timeout_count}")
    print(f" Cascade viable: {'YES' if summary['cascade_viable'] else 'NO — reconsider architecture'}")
    print(f" Completed: {completed_at}")
    print(f" Full results: {RESULTS_FILE}")
    print("=" * 60)


if __name__ == "__main__":
    main()