scripts/: separate production from experimental and deprecated

Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2, base_class, cascade, cost_test, briefing, consistency, token series). Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py, tier1_migration.py — under the bespoke decision, both target retired substrate work). Removes 19 .bak* files from disk (gitignored, never tracked; git history is the durable record of every prior version). The 11 production scripts remain in scripts/. All systemd ExecStart paths, api.py subprocess calls, and cron jobs continue to resolve correctly — verified by grep against /etc/systemd/system/aaronai-*.service, scripts/ references in api.py, and the user crontab. Track 1 inventory cross-cutting finding: scripts/ mixed 11 production files with 32 experimental scripts and ~20 .bak files. After this commit, a clean-room reader can identify the live workers from a directory listing alone. Found by Track 1 inventory 2026-05-02. See ~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning. After commit, run: (1) `git log --oneline -3` — show the new commit on top; (2) `git status` — confirm a clean working tree (modulo the untracked docs/ files, which are intentional).
2026-05-02 23:28:24 +00:00
parent 6f2d274d5d
commit 3f7fba7e0e
30 changed files with 0 additions and 0 deletions
@@ -0,0 +1,551 @@
+"""
+Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
+
+Reads entities from FalkorDB group_id 'aaron', infers light type labels,
+computes pairwise similarity within type blocks using ego summary embedding +
+name string distance + neighbor pattern overlap, generates merge proposals
+above threshold, writes proposal log for human review.
+
+Does NOT execute merges. 0.1 is the calibration phase — proposals only,
+human reviews before any action.
+"""
+import json
+import re
+import os
+import time
+from datetime import datetime, timezone
+from collections import defaultdict
+from pathlib import Path
+
+import requests
+from falkordb import FalkorDB
+import numpy as np
+
+# Configuration
+# Name of the FalkorDB graph selected by generate_proposals().
+GROUP_ID = "aaron"
+# Confidence bands applied in generate_proposals():
+#   confidence >= HIGH          -> merge proposal
+#   LOW <= confidence < HIGH    -> logged as a low-confidence candidate
+#   confidence < LOW            -> discarded
+HIGH_CONFIDENCE_THRESHOLD = 0.85  # propose merge at or above this
+LOW_CONFIDENCE_THRESHOLD = 0.65   # log as low-confidence at or above this (and below HIGH)
+# Directory that receives the markdown and JSON proposal logs.
+PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
+# Import-time side effect: creates the directory (and any parents) if missing.
+PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def cosine_similarity(a, b):
+    """
+    Return the cosine of the angle between two vectors as a Python float.
+
+    Both inputs are converted to float32 NumPy arrays before any math is
+    done. If either vector has a norm of zero the result is defined as 0.0
+    rather than dividing by zero.
+    """
+    vec_a = np.array(a, dtype=np.float32)
+    vec_b = np.array(b, dtype=np.float32)
+
+    norm_a = np.linalg.norm(vec_a)
+    norm_b = np.linalg.norm(vec_b)
+    if norm_a == 0 or norm_b == 0:
+        # A zero vector has no direction; treat it as "no similarity".
+        return 0.0
+
+    dot_product = np.dot(vec_a, vec_b)
+    return float(dot_product / (norm_a * norm_b))
+
+
+def name_similarity(name_a, name_b):
+    """
+    Score how likely two entity names refer to the same thing (0.0 to 1.0).
+
+    The checks run in this order and the first one that applies wins:
+
+    1. Case-insensitive exact match after trimming whitespace -> 1.0
+    2. Either name contains no word tokens -> 0.0
+    3. One lowercased name is a character substring of the other
+       ("Aaron" / "Aaron Nelson") -> 0.7 + 0.2 * (shorter / longer)
+    4. Acronym match -> 0.85. Covers full expansions
+       ("HVAMC" / "Hudson Valley Additive Manufacturing Center") and
+       partially abbreviated ones ("HVAMC" / "Hudson Valley AMC").
+    5. Otherwise the Jaccard overlap of the word-token sets
+       ("Aaron Nelson" / "Nelson, Aaron" -> 1.0).
+
+    Note that step 3 works on raw characters, not tokens, so a short name
+    that happens to appear inside a longer word also scores in that band.
+    """
+    # Trim once so every later check (including the acronym check) sees the
+    # same text; previously the acronym helper received the untrimmed names.
+    a_clean = name_a.strip()
+    b_clean = name_b.strip()
+    a_lower = a_clean.lower()
+    b_lower = b_clean.lower()
+
+    if a_lower == b_lower:
+        return 1.0
+
+    # Tokenize
+    a_tokens = set(re.findall(r'\b\w+\b', a_lower))
+    b_tokens = set(re.findall(r'\b\w+\b', b_lower))
+
+    if not a_tokens or not b_tokens:
+        return 0.0
+
+    # Substring containment (handles "Aaron" in "Aaron Nelson")
+    if a_lower in b_lower or b_lower in a_lower:
+        # Strong signal but not 1.0 — different lengths
+        shorter = min(len(a_lower), len(b_lower))
+        longer = max(len(a_lower), len(b_lower))
+        return 0.7 + 0.2 * (shorter / longer)
+
+    # Token Jaccard (handles "Aaron Nelson" vs "Nelson, Aaron")
+    jaccard = len(a_tokens & b_tokens) / len(a_tokens | b_tokens)
+
+    def is_acronym(short, full):
+        """Return True when `short` abbreviates the multi-word name `full`."""
+        if len(short) >= len(full):
+            return False
+        words = full.split()
+        if len(words) < 2:
+            return False
+        target = short.upper()
+
+        # One initial per word: "Hudson Valley Additive ..." -> "HVAMC".
+        # A contiguous run of the initials is also accepted.
+        initials = ''.join(w[0].upper() for w in words)
+        if target == initials or target in initials:
+            return True
+
+        # Words that are already all-caps abbreviations contribute every
+        # letter: "Hudson Valley AMC" -> "H" + "V" + "AMC" = "HVAMC".
+        # Without this the documented HVAMC / Hudson Valley AMC pair
+        # scored 0.0.
+        expanded = ''.join(
+            w if (w.isalpha() and w.isupper()) else w[0].upper()
+            for w in words
+        )
+        return target == expanded
+
+    if is_acronym(a_clean, b_clean) or is_acronym(b_clean, a_clean):
+        return 0.85
+
+    return jaccard
+
+
+def infer_type(entity_name, summary):
+    """
+    Guess a coarse entity type for blocking, using transparent heuristics.
+
+    Returns one of "person", "organization", "project", "place" or
+    "unknown". The categories are tested in that order and the first match
+    wins, so a name or summary that triggers several rules gets the
+    earliest type. (The function never returns "concept".)
+
+    This is deliberately imprecise: it only has to keep obviously
+    mismatched pairs (person vs project) from being compared. Anything
+    ambiguous falls through to "unknown", which is compared against every
+    block.
+
+    `summary` may be None; it is treated as an empty string.
+    """
+    lowered_name = entity_name.lower().strip()
+    lowered_summary = (summary or "").lower()
+
+    def mentions(text, fragments):
+        """True when any fragment occurs as a substring of text."""
+        return any(fragment in text for fragment in fragments)
+
+    # --- Person -----------------------------------------------------------
+    # Exactly two title-cased words ("Aaron Nelson"), or a summary that
+    # describes a person.
+    two_word_name = re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', entity_name.strip())
+    person_phrases = (
+        'is a person', 'is a professor', 'is an artist', 'is a colleague',
+        'is a friend', 'is a family member', 'works at', 'studied at',
+        "'s spouse", "'s child", "'s parent", "'s student",
+    )
+    if two_word_name is not None or mentions(lowered_summary, person_phrases):
+        return "person"
+
+    # --- Organization -----------------------------------------------------
+    org_name_markers = (
+        ' inc', ' llc', ' corp', ' company', ' university', ' college',
+        ' school', ' institute', ' foundation', ' department',
+    )
+    org_phrases = (
+        'is a company', 'is a university', 'is an organization',
+        'is an institution', 'is a department', 'is a nonprofit',
+    )
+    if mentions(lowered_name, org_name_markers) or mentions(lowered_summary, org_phrases):
+        return "organization"
+
+    # --- Project ----------------------------------------------------------
+    project_phrases = (
+        'is a project', 'software project', 'is a codebase',
+        'is a tool', 'is a system', 'is an application',
+        'is a research project', 'is a design project',
+    )
+    project_name_markers = (' project', ' system', ' platform')
+    if mentions(lowered_summary, project_phrases) or mentions(lowered_name, project_name_markers):
+        return "project"
+
+    # --- Place ------------------------------------------------------------
+    place_phrases = (
+        'is a city', 'is a town', 'is a state', 'is a country',
+        'is a neighborhood', 'is a region', 'is a location',
+    )
+    if mentions(lowered_summary, place_phrases):
+        return "place"
+
+    # Nothing matched.
+    return "unknown"
+
+
+def get_neighbors(graph, entity_uuid, limit=20):
+    """
+    Return the set of names of entities one RELATES_TO hop away.
+
+    The edge is matched in either direction. At most `limit` distinct names
+    are requested from the graph, and rows whose name is empty or None are
+    dropped from the result.
+    """
+    cypher = """
+    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
+    RETURN DISTINCT other.name AS name
+    LIMIT $limit
+    """
+    params = {"uuid": entity_uuid, "limit": limit}
+    rows = graph.query(cypher, params).result_set
+    return {row[0] for row in rows if row[0]}
+
+
+def neighbor_jaccard(neighbors_a, neighbors_b):
+    """
+    Neighbor overlap measured as containment (despite the function's name).
+
+    Computes |A ∩ B| / min(|A|, |B|): the share of the smaller neighbor set
+    that also appears in the larger one. Returns 0.0 when either set is
+    empty.
+
+    Why containment and not Jaccard: in a personal cognitive corpus one
+    entity (e.g., the user) is often a hub with hundreds of edges while its
+    alias candidates are small subset entities. Jaccard reads that size gap
+    as dissimilarity; containment shows it for the subset relationship it
+    is. DEG-RAG used Jaccard because entities in its academic corpus have
+    roughly comparable connectivity, which does not hold here.
+    """
+    if not (neighbors_a or neighbors_b):
+        return 0.0
+
+    shared = neighbors_a & neighbors_b
+    smaller_size = min(len(neighbors_a), len(neighbors_b))
+    # One side empty means there is nothing to contain.
+    return len(shared) / smaller_size if smaller_size else 0.0
+
+
+def get_edge_count(graph, entity_uuid):
+    """
+    Count the RELATES_TO edges (either direction) attached to an entity.
+
+    Returns the count from the first result row, or 0 when the query
+    produces no rows.
+    """
+    cypher = """
+    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
+    RETURN count(r) AS c
+    """
+    rows = graph.query(cypher, {"uuid": entity_uuid}).result_set
+    if not rows:
+        return 0
+    return rows[0][0]
+
+
+def combine_signals(name_sim, ego_sim, neighbor_sim):
+    """
+    Fold the three similarity signals into one confidence score.
+
+    The rules are evaluated top to bottom; the first one that applies
+    decides the score:
+
+    1. Strong containment (neighbor >= 0.7) with some ego support
+       (ego >= 0.3): 0.4 neighbor + 0.4 ego + 0.2 name. This is the
+       asymmetric alias case (Aaron+Nelson), where the smaller entity's
+       neighbors sit mostly inside the hub's even though the names differ.
+    2. Weak ego (< 0.3) and weak neighbor overlap (< 0.4): probably not
+       aliases, so the score is the larger of the two, capped at 0.4.
+    3. Near-identical names (>= 0.85) with at least moderate ego (>= 0.5):
+       0.4 ego + 0.4 name + 0.2 neighbor.
+    4. Everything else: 0.45 ego + 0.3 neighbor + 0.25 name.
+
+    The weights differ from the DEG-RAG defaults because personal corpora
+    have asymmetric topology (one hub user, small subset alias entities).
+    """
+    contained_with_ego_support = neighbor_sim >= 0.7 and ego_sim >= 0.3
+    if contained_with_ego_support:
+        return 0.4 * neighbor_sim + 0.4 * ego_sim + 0.2 * name_sim
+
+    ego_and_neighbors_both_weak = ego_sim < 0.3 and neighbor_sim < 0.4
+    if ego_and_neighbors_both_weak:
+        strongest = max(ego_sim, neighbor_sim)
+        return min(0.4, strongest)
+
+    names_match_with_ego_support = name_sim >= 0.85 and ego_sim >= 0.5
+    if names_match_with_ego_support:
+        return 0.4 * ego_sim + 0.4 * name_sim + 0.2 * neighbor_sim
+
+    # Default blend: ego is primary, neighbor and name roughly balanced.
+    return 0.45 * ego_sim + 0.3 * neighbor_sim + 0.25 * name_sim
+
+
+def compute_summary_embedding(text, model="nomic-embed-text"):
+    """
+    Ask the local Ollama server for an embedding of a summary text.
+
+    Summary embeddings provide the ego-similarity signal: two entities
+    whose names share no tokens (Aaron vs Nelson) have dissimilar name
+    embeddings, but summaries describing overlapping content (faculty
+    member at SUNY, HVAMC, etc.) should embed close together.
+
+    Returns the "embedding" field of the JSON response, or None when:
+    - `text` is empty/None or shorter than 10 characters (no request made);
+    - the request or response handling raises (the error is printed).
+
+    Only the first 2000 characters of `text` are sent.
+    """
+    has_enough_text = bool(text) and len(text) >= 10
+    if not has_enough_text:
+        return None
+
+    try:
+        payload = {"model": model, "prompt": text[:2000]}
+        reply = requests.post(
+            "http://localhost:11434/api/embeddings",
+            json=payload,
+            timeout=30,
+        )
+        reply.raise_for_status()
+        embedding = reply.json().get("embedding")
+    except Exception as err:
+        # Best effort: one failed embedding must not abort the whole run.
+        print(f"  Embedding error: {err}")
+        return None
+    return embedding
+
+
+def precompute_summary_embeddings(entities, model="nomic-embed-text", cache_path=None):
+    """
+    Attach a "summary_embedding" to every entity dict, using an on-disk cache.
+
+    Args:
+        entities: list of dicts with at least "uuid" and "summary" keys.
+            Each dict is mutated in place: "summary_embedding" is set to the
+            embedding, or to None when none could be computed.
+        model: Ollama embedding model passed to compute_summary_embedding.
+        cache_path: JSON cache file mapping uuid -> embedding. Defaults to
+            the experiments cache used by earlier runs.
+
+    Returns:
+        None.
+
+    The cache is keyed by uuid only, so an entity whose summary changed
+    keeps its previously cached embedding until the cache entry is removed.
+    """
+    print(f"Computing summary embeddings via Ollama ({model})...")
+    print(f"  Total entities: {len(entities)}")
+
+    if cache_path is None:
+        cache_path = "/home/aaron/aaronai/experiments/summary_embeddings_cache.json"
+    cache_path = Path(cache_path)
+    # Fail early (before the embedding calls) if the directory cannot be
+    # created, instead of only when the first save is attempted.
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+
+    cache = {}
+    if cache_path.exists():
+        with open(cache_path) as f:
+            cache = json.load(f)
+        print(f"  Loaded {len(cache)} cached embeddings")
+
+    def save_cache():
+        # Write to a sibling temp file and rename over the real one, so an
+        # interrupted run cannot leave a truncated cache behind.
+        tmp_path = cache_path.with_name(cache_path.name + ".tmp")
+        with open(tmp_path, "w") as f:
+            json.dump(cache, f)
+        os.replace(tmp_path, cache_path)
+
+    new_count = 0
+    start = time.time()
+    total = len(entities)
+    for i, e in enumerate(entities):
+        if e["uuid"] in cache:
+            e["summary_embedding"] = cache[e["uuid"]]
+            continue
+
+        emb = compute_summary_embedding(e["summary"], model=model)
+        if not emb:
+            e["summary_embedding"] = None
+            continue
+
+        e["summary_embedding"] = emb
+        cache[e["uuid"]] = emb
+        new_count += 1
+
+        # Checkpoint every 100 new embeddings. This lives in the success
+        # path on purpose: checking after failures too would rewrite the
+        # whole cache on every failed entity while new_count stayed at a
+        # multiple of 100.
+        if new_count % 100 == 0:
+            save_cache()
+            elapsed = time.time() - start
+            rate = new_count / elapsed if elapsed > 0 else 0
+            remaining = (total - i - 1) / rate if rate > 0 else 0
+            print(f"  ... {i+1}/{total} (computed {new_count} new, ~{remaining:.0f}s remaining)")
+
+    # Final save
+    save_cache()
+
+    have_embeddings = sum(1 for e in entities if e.get("summary_embedding"))
+    print(f"  Done. {have_embeddings}/{total} entities have summary embeddings")
+
+
+def generate_proposals():
+    """
+    Scan the graph for likely alias pairs and bucket them by confidence.
+
+    Steps:
+      1. Load every Entity node that has both a name embedding and a summary.
+      2. Attach summary embeddings via precompute_summary_embeddings.
+      3. Infer a coarse type per entity and block comparisons by type;
+         'unknown' entities are compared against every block.
+      4. Score each candidate pair from the name, ego (embedding) and
+         neighbor signals with combine_signals.
+
+    Returns:
+        (proposals, low_confidence, entity_count, pair_count), where
+        proposals holds records with confidence >= HIGH_CONFIDENCE_THRESHOLD,
+        low_confidence holds records with LOW <= confidence < HIGH, and
+        pair_count is the number of candidate pairs produced by blocking
+        (counted before the cheap pre-filter).
+
+    The graph is only read here; no merge is executed.
+    """
+    db = FalkorDB(host='localhost', port=6379)
+    graph = db.select_graph(GROUP_ID)
+
+    # Pull all entities with embeddings
+    print(f"Fetching entities from group_id '{GROUP_ID}'...")
+    result = graph.query("""
+    MATCH (n:Entity)
+    WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
+    RETURN n.uuid, n.name, n.summary, n.name_embedding
+    """)
+
+    entities = [
+        {
+            'uuid': row[0],
+            'name': row[1],
+            'summary': row[2],
+            'embedding': row[3],
+        }
+        for row in result.result_set
+    ]
+    print(f"  Loaded {len(entities)} entities with embeddings")
+
+    # Compute summary embeddings (true ego signal, beyond name embeddings)
+    precompute_summary_embeddings(entities)
+
+    # Infer types and group by them for blocking
+    print("Inferring entity types for blocking...")
+    blocks = defaultdict(list)
+    for e in entities:
+        e['inferred_type'] = infer_type(e['name'], e['summary'])
+        blocks[e['inferred_type']].append(e)
+    for t, ents in sorted(blocks.items(), key=lambda item: -len(item[1])):
+        print(f"  {t}: {len(ents)}")
+
+    # 'unknown' entities get compared against everything (they might be any type)
+    # Other types only get compared within their type block + against unknowns
+    print()
+    print("Comparing entities within type blocks...")
+    proposals = []
+    low_confidence = []
+
+    # Build comparison pairs
+    pairs_to_compare = []
+    typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'}
+    unknown_block = blocks.get('unknown', [])
+
+    # Within-type pairs (excluding unknown)
+    for ents in typed_blocks.values():
+        for i, first in enumerate(ents):
+            for second in ents[i + 1:]:
+                pairs_to_compare.append((first, second))
+
+    # Unknown vs unknown
+    for i, first in enumerate(unknown_block):
+        for second in unknown_block[i + 1:]:
+            pairs_to_compare.append((first, second))
+
+    # Unknown vs typed (unknowns might be any type)
+    for ent_unknown in unknown_block:
+        for ents in typed_blocks.values():
+            for ent_typed in ents:
+                pairs_to_compare.append((ent_unknown, ent_typed))
+
+    total_pairs = len(pairs_to_compare)
+    print(f"  Pairs to compare: {total_pairs:,}")
+
+    # Per-run memoization of graph lookups. The graph is not modified during
+    # the run, so each entity needs at most one query of each kind.
+    neighbor_cache = {}
+    edge_count_cache = {}
+
+    def neighbors_cached(uuid):
+        if uuid not in neighbor_cache:
+            neighbor_cache[uuid] = get_neighbors(graph, uuid)
+        return neighbor_cache[uuid]
+
+    def edge_count_cached(uuid):
+        if uuid not in edge_count_cache:
+            edge_count_cache[uuid] = get_edge_count(graph, uuid)
+        return edge_count_cache[uuid]
+
+    def describe(ent):
+        """Build the per-entity part of a proposal record."""
+        return {
+            'uuid': ent['uuid'],
+            'name': ent['name'],
+            'type': ent['inferred_type'],
+            'summary': ent['summary'][:200],
+            'edge_count': edge_count_cached(ent['uuid']),
+        }
+
+    for comparisons_done, (ent_a, ent_b) in enumerate(pairs_to_compare, start=1):
+        if comparisons_done % 5000 == 0:
+            print(f"  ... {comparisons_done:,} / {total_pairs:,}")
+
+        # Compute name similarity (handles formal/informal pairs, acronyms)
+        name_sim = name_similarity(ent_a['name'], ent_b['name'])
+
+        # Compute ego similarity using SUMMARY embeddings (the actual semantic
+        # content), falling back to name embeddings if summaries unavailable.
+        # Summary similarity catches Aaron+Nelson where name similarity fails.
+        if ent_a.get('summary_embedding') and ent_b.get('summary_embedding'):
+            ego_sim_quick = cosine_similarity(ent_a['summary_embedding'], ent_b['summary_embedding'])
+        else:
+            ego_sim_quick = cosine_similarity(ent_a['embedding'], ent_b['embedding'])
+
+        # Pre-filter to avoid expensive neighbor query on obviously different pairs.
+        # Lowered thresholds vs DEG-RAG defaults because personal-corpus aliases often
+        # have low name_embedding similarity (different surface tokens) but high
+        # neighbor overlap. We let weaker name/ego signals through to the neighbor
+        # check, which can rescue them via containment metric.
+        if ego_sim_quick < 0.3 and name_sim < 0.15:
+            continue
+
+        # Full comparison
+        neighbors_a = neighbors_cached(ent_a['uuid'])
+        neighbors_b = neighbors_cached(ent_b['uuid'])
+        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)
+
+        confidence = combine_signals(name_sim, ego_sim_quick, neighbor_sim)
+
+        # Pick the bucket first so that discarded pairs never trigger the
+        # edge-count queries or record construction.
+        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
+            bucket = proposals
+        elif confidence >= LOW_CONFIDENCE_THRESHOLD:
+            bucket = low_confidence
+        else:
+            continue
+
+        bucket.append({
+            'entity_a': describe(ent_a),
+            'entity_b': describe(ent_b),
+            'confidence': round(confidence, 3),
+            'signals': {
+                'name_similarity': round(name_sim, 3),
+                'ego_similarity': round(ego_sim_quick, 3),
+                'neighbor_overlap': round(neighbor_sim, 3),
+            },
+            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
+        })
+
+    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
+    return proposals, low_confidence, len(entities), total_pairs
+
+
+def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
+    """
+    Write one run's results to PROPOSALS_DIR as a markdown review file plus
+    a raw JSON dump.
+
+    Args:
+        proposals: high-confidence records from generate_proposals.
+        low_confidence: low-confidence records from generate_proposals.
+        total_entities: number of entities scanned (statistics only).
+        total_comparisons: number of candidate pairs (statistics only).
+
+    Both files are named proposals-<UTC timestamp>, with minute resolution,
+    so a second run within the same minute overwrites the first. Records
+    are sorted by descending confidence; the markdown lists at most 30
+    low-confidence candidates, while the JSON contains all of them.
+
+    Both files are written as UTF-8 explicitly: the markdown contains
+    non-ASCII characters ("—", "≥") and would otherwise fail to encode
+    under a non-UTF-8 locale.
+    """
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
+    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"
+
+    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
+    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])
+
+    lines = [
+        f"# Consolidator 0.1 — Run {timestamp}",
+        "",
+        "## Statistics",
+        f"- Entities scanned: {total_entities:,}",
+        f"- Pairwise comparisons: {total_comparisons:,}",
+        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
+        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
+        "",
+        "## How to review",
+        "",
+        "For each proposal, mark your decision by changing `[ ]` to one of:",
+        "- `[APPROVE]` — execute this merge on next run",
+        "- `[REJECT]` — don't merge, don't propose again",
+        "- `[DEFER]` — re-surface in next run for further consideration",
+        "",
+        "Save the file when done. Do not modify proposal_id or uuid fields.",
+        "",
+        "---",
+        "",
+        f"## Proposed Merges (n={len(proposals)})",
+        "",
+    ]
+
+    for i, p in enumerate(proposals_sorted, start=1):
+        lines += [
+            f"### Proposal {i}",
+            "",
+            "**Decision:** [ ]",
+            "",
+            f"**Confidence:** {p['confidence']}",
+            "",
+        ]
+        # Entity A and Entity B share one layout.
+        for label, key in (("A", 'entity_a'), ("B", 'entity_b')):
+            ent = p[key]
+            lines += [
+                f"**Entity {label}:** \"{ent['name']}\" (type: {ent['type']}, {ent['edge_count']} edges)",
+                f"  - uuid: `{ent['uuid']}`",
+                f"  - summary: {ent['summary']}",
+                "",
+            ]
+        signals = p['signals']
+        lines += [
+            "**Signals:**",
+            f"  - Name similarity: {signals['name_similarity']}",
+            f"  - Ego (summary) similarity: {signals['ego_similarity']}",
+            f"  - Neighbor overlap: {signals['neighbor_overlap']}",
+        ]
+        if p['shared_neighbors']:
+            shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
+            lines.append(f"  - Shared neighbors (sample): {shared_str}")
+        lines += [
+            "",
+            "**Optional rejection note:** ",
+            "",
+            "---",
+            "",
+        ]
+
+    lines += [
+        "",
+        f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)",
+        "",
+    ]
+    for p in low_sorted[:30]:
+        lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})")
+    if len(low_sorted) > 30:
+        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")
+
+    out_path.write_text("\n".join(lines), encoding="utf-8")
+    print(f"\nProposal log written to: {out_path}")
+
+    # Also save raw JSON for downstream tooling
+    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
+    with open(json_path, 'w', encoding="utf-8") as f:
+        json.dump({
+            'run_timestamp': timestamp,
+            'statistics': {
+                'total_entities': total_entities,
+                'total_comparisons': total_comparisons,
+                'proposal_count': len(proposals),
+                'low_confidence_count': len(low_confidence),
+            },
+            'proposals': proposals_sorted,
+            'low_confidence': low_sorted,
+        }, f, indent=2)
+    print(f"Raw JSON: {json_path}")
+
+
+def main():
+    """Run one calibration pass: generate proposals, write the logs, print next steps."""
+    banner = "=" * 70
+    print(banner)
+    print("Consolidator 0.1 — Calibration Phase")
+    print(banner)
+    print()
+
+    # generate_proposals returns exactly the four arguments that
+    # write_proposals_log expects, in the same order.
+    run_results = generate_proposals()
+    write_proposals_log(*run_results)
+
+    print()
+    next_steps = (
+        "Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER",
+        "for each proposal. Re-run will read decisions and execute approved merges.",
+    )
+    for message in next_steps:
+        print(message)
+
+if __name__ == "__main__":
+    main()