"""E1.6 domain-purity rating interface — with full metadata context.

Interactive command-line tool. It loads the sources from the E1.4 cascade
results file, presents them one at a time in a fixed (seeded) shuffled order
together with their metadata block, and records a human rating for each:
a single/multi-domain flag, a 1-5 purity score and an optional note.

Ratings are written to RATINGS_OUT after every source, so a session can be
interrupted and resumed; sources that already have a rating are not shown
again.
"""

import json
import os
import random

E14_RESULTS = "/home/aaron/aaronai/experiments/e14_cascade_results.json"
RATINGS_OUT = "/home/aaron/aaronai/experiments/e16_purity_ratings.json"

INTRO = """
================================================================================
E1.6 — DOMAIN-PURITY RATING
================================================================================

Two ratings per source:

  1. BINARY — single-domain (s) or multi-domain (m)?

     Mental test: "If Mistral had to pick ONE domain class for this source,
     would picking just one significantly UNDER-DESCRIBE the content?"

     YES → MULTI-DOMAIN (m) — content lives across two+ frames meaningfully
     NO → SINGLE-DOMAIN (s) — content fits cleanly within one frame

  2. SCORE (1-5) — how cleanly does it fit?

     5 = unambiguously one domain
     4 = primarily one domain, slight other element
     3 = balanced two-domain
     2 = primarily two-domain with traces of a third
     1 = three or more domain frames weighted significantly

     Single binary usually = score 4-5
     Multi binary usually = score 1-3

You see for each source: name, length, AND the full Mistral metadata block
(domain_class, primary_format, structural_signals, content_signals, summary).

Blind to: bucket assignment, cascade outcome.

Commands at any prompt: 's', 'm', 'skip', 'quit'
================================================================================
""".strip()


def load_existing():
    """Return the saved rating state, or a fresh empty state.

    The state is a dict with two lists: ``ratings`` (one dict per rated
    source) and ``completed_names`` (the names of those sources). If
    RATINGS_OUT exists but lacks either key, the missing key is filled with
    an empty list so callers can index both unconditionally.

    Raises:
        json.JSONDecodeError: if RATINGS_OUT exists but is not valid JSON.
            This is deliberately not swallowed: silently starting over would
            risk overwriting previously collected ratings.
    """
    if os.path.exists(RATINGS_OUT):
        with open(RATINGS_OUT, encoding="utf-8") as f:
            data = json.load(f)
        data.setdefault("ratings", [])
        data.setdefault("completed_names", [])
        return data
    return {"ratings": [], "completed_names": []}


def save(data):
    """Write the rating state to RATINGS_OUT as indented JSON.

    The data is written to a sibling temporary file first and then moved over
    the target with os.replace, so an interruption in the middle of a write
    cannot leave a truncated ratings file behind.
    """
    tmp_path = RATINGS_OUT + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, RATINGS_OUT)


def render_metadata(metadata):
    """Pretty-print the full Mistral metadata block.

    Known fields are printed first in a fixed order, followed by any other
    keys present in the dict. ``char_length`` is never printed here.
    Anything that is not a dict, or a dict carrying an ``error`` key, is
    reported as unavailable/errored instead.
    """
    if not isinstance(metadata, dict):
        print(" (metadata unavailable)")
        return
    if 'error' in metadata:
        print(f" (metadata error: {metadata['error']})")
        return

    # Render fields in a stable order
    field_order = [
        'domain_class',
        'primary_format',
        'structural_signals',
        'content_signals',
        'summary',
    ]
    for field in field_order:
        if field not in metadata:
            continue
        value = metadata[field]
        label = field.replace('_', ' ').title()
        if isinstance(value, list):
            if value:
                print(f" {label}:")
                for item in value:
                    print(f" - {item}")
            else:
                print(f" {label}: (none)")
        elif isinstance(value, str) and len(value) > 70:
            # Long strings go on their own line below the label (no wrapping
            # is applied; the terminal is left to wrap them).
            print(f" {label}:")
            print(f" {value}")
        else:
            print(f" {label}: {value}")

    # Show any other fields not in the standard order
    for field in metadata:
        if field in field_order or field == 'char_length':
            continue
        label = field.replace('_', ' ').title()
        print(f" {label}: {metadata[field]}")


def render_source(src, idx, total):
    """Print the header, name, length and metadata block for one source.

    Args:
        src: source record; ``name`` and ``doc_chars`` are required,
            ``metadata`` is optional.
        idx: 1-based position shown in the "Source idx/total" header.
        total: total number of sources shown in the header.
    """
    print()
    print("=" * 80)
    print(f" Source {idx}/{total}")
    print("=" * 80)
    print(f"Name: {src['name']}")
    print(f"Length: {src['doc_chars']:,} chars")
    print()
    print("Mistral metadata:")
    print()
    render_metadata(src.get('metadata', {}))
    print()
    print("-" * 80)


def get_rating():
    """Prompt for one rating and return it.

    Returns:
        ``'quit'`` if the rater asked to stop, ``None`` if the rater skipped
        this source, otherwise a dict with ``binary`` ("single" or "multi"),
        ``score`` (int 1-5) and ``note`` (str, or None when left empty).

    'skip' and 'quit' are honoured at both the single/multi prompt and the
    score prompt. The note prompt is free text, so whatever is typed there is
    stored as the note.
    """
    while True:
        binary = input(
            "Single-domain or multi-domain? [s/m/skip/quit]: "
        ).strip().lower()
        if binary in ('s', 'm', 'skip', 'quit'):
            break
        print(" Please enter 's', 'm', 'skip', or 'quit'")

    if binary == 'quit':
        return 'quit'
    if binary == 'skip':
        return None

    while True:
        score_input = input(
            "Purity score (1=many frames, 5=clearly single): "
        ).strip().lower()
        if score_input == 'quit':
            return 'quit'
        if score_input == 'skip':
            # The intro promises 'skip' at any prompt; without this branch the
            # rater could not back out after answering s/m.
            return None
        try:
            score = int(score_input)
        except ValueError:
            print(" Please enter a number 1-5 (or 'skip'/'quit')")
            continue
        if 1 <= score <= 5:
            break
        print(" Score must be 1-5")

    note = input("Optional note (Enter to skip): ").strip()
    return {
        "binary": "single" if binary == 's' else "multi",
        "score": score,
        "note": note if note else None,
    }


def main():
    """Run the interactive rating session.

    Loads the E1.4 results, keeps the entries that have a ``submit_result``
    key, shuffles them with a fixed seed so the order is the same on every
    run, drops the ones already rated, and walks the rater through the rest.
    State is saved after every recorded rating and again on quit/interrupt.
    """
    with open(E14_RESULTS, encoding="utf-8") as f:
        e14 = json.load(f)

    sources = [r for r in e14['results'] if 'submit_result' in r]

    # Fixed seed: the presentation order must be identical across sessions so
    # that resuming continues the same sequence.
    rng = random.Random(42)
    shuffled = list(sources)
    rng.shuffle(shuffled)

    state = load_existing()
    completed = set(state['completed_names'])
    remaining = [s for s in shuffled if s['name'] not in completed]

    print(INTRO)
    print()
    print(f"Total sources: {len(sources)}")
    print(f"Already rated: {len(completed)}")
    print(f"Remaining: {len(remaining)}")
    print()

    if not remaining:
        print("All sources rated. Run analysis script next.")
        return

    try:
        input("Press Enter to begin...")
        for i, src in enumerate(remaining, start=len(completed) + 1):
            render_source(src, i, len(sources))
            rating = get_rating()

            if rating == 'quit':
                print("\nSaving and exiting...")
                save(state)
                return
            if rating is None:
                # Skipped sources are not marked completed, so they come up
                # again in a later session.
                print(" Skipped")
                continue

            rating['name'] = src['name']
            state['ratings'].append(rating)
            state['completed_names'].append(src['name'])
            save(state)
            print(f" Recorded: {rating['binary']}-domain, score={rating['score']}")
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / Ctrl-D at any prompt: persist what we have and leave
        # without a traceback.
        print("\n\nSaving and exiting...")
        save(state)
        return

    print()
    print("=" * 80)
    print(f"Done. Rated {len(state['ratings'])} sources.")
    print(f"Saved to {RATINGS_OUT}")


if __name__ == "__main__":
    main()