# aaronAI/scripts/consolidator_v0_1.py.bak

"""
Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.

Reads entities from FalkorDB group_id 'aaron', infers light type labels,
computes pairwise similarity within type blocks using ego summary embedding +
name string distance + neighbor pattern overlap, generates merge proposals
above threshold, writes proposal log for human review.

Does NOT execute merges. 0.1 is the calibration phase — proposals only,
human reviews before any action.
"""
import json
import re
import os
from datetime import datetime, timezone
from collections import defaultdict
from pathlib import Path

from falkordb import FalkorDB
import numpy as np

# Configuration
# Name of the FalkorDB graph to open (passed to select_graph in generate_proposals).
GROUP_ID = "aaron"
# Pairs scoring at or above this value are written out as merge proposals.
HIGH_CONFIDENCE_THRESHOLD = 0.85  # propose merge at or above this
# Pairs scoring in [LOW, HIGH) are logged as informational candidates;
# anything below LOW is dropped entirely.
LOW_CONFIDENCE_THRESHOLD = 0.65   # lower bound of the low-confidence bucket
# Directory that receives the Markdown and JSON run logs.
PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
# NOTE: runs at import time, so importing this module creates the directory.
PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)


def cosine_similarity(a, b):
    """
    Return the cosine of the angle between embedding vectors `a` and `b`.

    Both inputs are converted to float32 arrays before the computation.
    If either vector has a norm of zero the result is defined as 0.0
    instead of dividing by zero.
    """
    vec_a, vec_b = (np.array(v, dtype=np.float32) for v in (a, b))
    norm_a, norm_b = np.linalg.norm(vec_a), np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / (norm_a * norm_b))


def name_similarity(name_a, name_b):
    """
    Score how alike two entity names are, on a 0.0-1.0 scale.

    The rules are tried in order and the first one that applies wins:

      1. Names equal after lowercasing and stripping -> 1.0
      2. Either name contains no word tokens         -> 0.0
      3. One normalized name is contained in the other
         ("Aaron" / "Aaron Nelson")                  -> 0.7 + 0.2 * ratio,
         where ratio is shorter length / longer length in characters.
         The containment test is character-level, so "al" inside
         "valley" also qualifies.
      4. The shorter raw name, upper-cased, appears within the initials
         of the longer multi-word name ("HVAMC")     -> 0.85
      5. Otherwise the Jaccard overlap of the two word-token sets
         ("Aaron Nelson" / "Nelson, Aaron").
    """
    norm_a = name_a.lower().strip()
    norm_b = name_b.lower().strip()
    if norm_a == norm_b:
        return 1.0

    tokens_a = set(re.findall(r'\b\w+\b', norm_a))
    tokens_b = set(re.findall(r'\b\w+\b', norm_b))
    if not (tokens_a and tokens_b):
        return 0.0

    # Rule 3: plain substring containment on the normalized strings.
    if norm_a in norm_b or norm_b in norm_a:
        len_a, len_b = len(norm_a), len(norm_b)
        shorter, longer = min(len_a, len_b), max(len_a, len_b)
        return 0.7 + 0.2 * (shorter / longer)

    def spells_initials(candidate, phrase):
        # The candidate must be strictly shorter than a phrase of 2+ words.
        if len(candidate) >= len(phrase):
            return False
        words = phrase.split()
        if len(words) < 2:
            return False
        letters = candidate if candidate.isupper() else candidate.upper()
        initials = ''.join(word[0].upper() for word in words)
        # Substring match: equality is just the special case of containment.
        return letters in initials

    # Rule 4 works on the raw (un-normalized) names, in both directions.
    if spells_initials(name_a, name_b) or spells_initials(name_b, name_a):
        return 0.85

    # Rule 5: token-set Jaccard.
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)


def infer_type(entity_name, summary):
    """
    Light type inference for blocking. Heuristic-based, transparent.
    Returns one of: person, organization, project, place, unknown.

    NOT a precise classification — just enough to avoid obviously wrong
    cross-type comparisons (person vs project). When in doubt, return
    'unknown' which gets compared against everything.

    Checks run in the order person, organization, project, place; the
    first match wins.

    Args:
        entity_name: Entity name as stored (original casing matters for
            the "First Last" pattern).
        summary: Free-text summary; None is treated as empty.

    The generic "two capitalised words" person pattern is skipped when one
    of the name's whole words is itself a type keyword (e.g. "College",
    "Inc", "Project"). Otherwise names such as "Vassar College" or
    "Bird Project" would be labelled 'person' before the organization /
    project checks could see them.
    """
    name_lower = entity_name.lower().strip()
    summary_lower = (summary or "").lower()

    # Name markers are matched as substrings (leading space included), so
    # " corp" also covers "Corporation" but not a name that starts with it.
    org_name_markers = (
        ' inc', ' llc', ' corp', ' company', ' university', ' college',
        ' school', ' institute', ' foundation', ' department',
    )
    project_name_markers = (' project', ' system', ' platform')

    # Whole-word veto for the full-name pattern. Exact word match keeps
    # surnames like "Schooler" classified as people.
    type_keywords = {m.strip() for m in org_name_markers + project_name_markers}
    name_words = set(re.findall(r'\w+', name_lower))
    names_a_typed_thing = not type_keywords.isdisjoint(name_words)

    # Person: First+Last pattern (two title-cased words, no other tokens),
    # or a summary phrase that describes a person.
    looks_like_full_name = bool(
        re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', entity_name.strip())
    )
    person_phrases = (
        'is a person', 'is a professor', 'is an artist', 'is a colleague',
        'is a friend', 'is a family member', 'works at', 'studied at',
        "'s spouse", "'s child", "'s parent", "'s student",
    )
    if (looks_like_full_name and not names_a_typed_thing) or any(
        phrase in summary_lower for phrase in person_phrases
    ):
        return "person"

    # Organization: company/institution indicators in name or summary.
    org_phrases = (
        'is a company', 'is a university', 'is an organization',
        'is an institution', 'is a department', 'is a nonprofit',
    )
    if any(marker in name_lower for marker in org_name_markers) or any(
        phrase in summary_lower for phrase in org_phrases
    ):
        return "organization"

    # Project: software/creative work indicators in summary or name.
    project_phrases = (
        'is a project', 'software project', 'is a codebase',
        'is a tool', 'is a system', 'is an application',
        'is a research project', 'is a design project',
    )
    if any(phrase in summary_lower for phrase in project_phrases) or any(
        marker in name_lower for marker in project_name_markers
    ):
        return "project"

    # Place: location indicators (summary only).
    place_phrases = (
        'is a city', 'is a town', 'is a state', 'is a country',
        'is a neighborhood', 'is a region', 'is a location',
    )
    if any(phrase in summary_lower for phrase in place_phrases):
        return "place"

    # Default
    return "unknown"


def get_neighbors(graph, entity_uuid, limit=20):
    """
    Get the names of entities connected to this entity (1-hop).

    Args:
        graph: Graph handle exposing query(cypher, params) whose result
            has a `result_set` of rows.
        entity_uuid: uuid property of the Entity node to start from.
        limit: Maximum number of distinct neighbor names to fetch.

    Returns:
        A set of neighbor names; null/empty names are dropped.

    The query orders by name before applying LIMIT. Without an ORDER BY
    the database may return any `limit` neighbors, so entities with more
    neighbors than the limit would yield a different sample (and a
    different Jaccard score) from run to run.
    """
    query = """
    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
    RETURN DISTINCT other.name AS name
    ORDER BY name
    LIMIT $limit
    """
    result = graph.query(query, {"uuid": entity_uuid, "limit": limit})
    return {row[0] for row in result.result_set if row[0]}


def neighbor_jaccard(neighbors_a, neighbors_b):
    """
    Jaccard index of two neighbor-name sets: |A ∩ B| / |A ∪ B|.

    Returns 0.0 when both sets are empty (nothing to compare).
    """
    combined = neighbors_a | neighbors_b
    if not combined:
        # Only possible when both inputs are empty.
        return 0.0
    shared = neighbors_a & neighbors_b
    return len(shared) / len(combined)


def get_edge_count(graph, entity_uuid):
    """
    Count the RELATES_TO edges attached to an entity, in either direction.

    Returns the first column of the first result row, or 0 when the query
    yields no rows.
    """
    query = """
    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
    RETURN count(r) AS c
    """
    rows = graph.query(query, {"uuid": entity_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]


def combine_signals(name_sim, ego_sim, neighbor_sim):
    """
    Fold the three similarity signals into one confidence score.

    Ego similarity always carries half the weight. The other half is
    split between name and neighbor similarity:

      * ego_sim < 0.4                      -> the score is ego_sim itself;
        a low ego score overrides any name or neighbor agreement
        (different concepts can share names).
      * name_sim >= 0.85 and ego_sim >= 0.65 -> 0.3 name / 0.2 neighbor.
      * otherwise                            -> 0.25 name / 0.25 neighbor.
    """
    if ego_sim < 0.4:
        # Capping at 0.5 would be a no-op here: ego_sim is already below 0.4.
        return ego_sim

    strong_name_match = name_sim >= 0.85 and ego_sim >= 0.65
    name_weight, neighbor_weight = (0.3, 0.2) if strong_name_match else (0.25, 0.25)
    return 0.5 * ego_sim + name_weight * name_sim + neighbor_weight * neighbor_sim


def generate_proposals():
    """
    Scan the graph and score candidate alias pairs. Read-only: no merge is
    executed and nothing is written to the graph.

    Steps:
      1. Load every Entity node that has a name embedding and a summary.
      2. Assign each a coarse type via infer_type() and bucket by type.
      3. Enumerate pairs: within each typed bucket, within 'unknown', and
         every 'unknown' entity against every typed entity.
      4. Score each pair from name similarity, embedding cosine and
         neighbor Jaccard, and keep those at or above
         LOW_CONFIDENCE_THRESHOLD.

    Returns:
        (proposals, low_confidence, entity_count, pair_count). The first
        two are lists of record dicts (entity_a, entity_b, confidence,
        signals, shared_neighbors); pair_count is the number of pairs
        enumerated, including those skipped by the pre-filter.

    NOTE(review): the value reported as 'ego_similarity' is the cosine of
    the nodes' `name_embedding` property — confirm that is the intended
    embedding for the "ego summary" signal.
    """
    # Local import: only this function needs it.
    from itertools import combinations

    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)

    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
    MATCH (n:Entity)
    WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
    RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)
    entities = [
        {'uuid': row[0], 'name': row[1], 'summary': row[2], 'embedding': row[3]}
        for row in result.result_set
    ]
    print(f"  Loaded {len(entities)} entities with embeddings")

    # Infer types and group into blocks in a single pass.
    print("Inferring entity types for blocking...")
    blocks = defaultdict(list)
    for e in entities:
        e['inferred_type'] = infer_type(e['name'], e['summary'])
        blocks[e['inferred_type']].append(e)
    for type_name, members in sorted(blocks.items(), key=lambda kv: -len(kv[1])):
        print(f"  {type_name}: {len(members)}")

    # 'unknown' entities get compared against everything (they might be any type)
    # Other types only get compared within their type block + against unknowns
    print()
    print("Comparing entities within type blocks...")
    typed_blocks = [ents for t, ents in blocks.items() if t != 'unknown']
    unknown_block = blocks.get('unknown', [])

    pairs_to_compare = []
    # Within-type pairs first, then unknown-vs-unknown. combinations()
    # yields (ents[i], ents[j]) for i < j in index order.
    for ents in typed_blocks + [unknown_block]:
        pairs_to_compare.extend(combinations(ents, 2))
    # Unknown vs typed (unknowns might be any type)
    for ent_unknown in unknown_block:
        for ents in typed_blocks:
            pairs_to_compare.extend((ent_unknown, ent_typed) for ent_typed in ents)

    total_pairs = len(pairs_to_compare)
    print(f"  Pairs to compare: {total_pairs:,}")

    # Per-run caches: each entity appears in many pairs, and this run does
    # not modify the graph, so one query per entity is enough.
    neighbor_cache = {}
    edge_count_cache = {}

    def neighbors_cached(uuid):
        if uuid not in neighbor_cache:
            neighbor_cache[uuid] = get_neighbors(graph, uuid)
        return neighbor_cache[uuid]

    def edge_count_cached(uuid):
        if uuid not in edge_count_cache:
            edge_count_cache[uuid] = get_edge_count(graph, uuid)
        return edge_count_cache[uuid]

    def describe(ent):
        # Reviewer-facing snapshot of one side of a pair.
        return {
            'uuid': ent['uuid'],
            'name': ent['name'],
            'type': ent['inferred_type'],
            'summary': ent['summary'][:200],
            'edge_count': edge_count_cached(ent['uuid']),
        }

    proposals = []
    low_confidence = []
    for comparisons_done, (ent_a, ent_b) in enumerate(pairs_to_compare, start=1):
        if comparisons_done % 5000 == 0:
            print(f"  ... {comparisons_done:,} / {total_pairs:,}")

        name_sim = name_similarity(ent_a['name'], ent_b['name'])
        ego_sim = cosine_similarity(ent_a['embedding'], ent_b['embedding'])

        # Pre-filter to avoid expensive neighbor query on obviously different pairs
        if ego_sim < 0.5 and name_sim < 0.3:
            continue

        neighbors_a = neighbors_cached(ent_a['uuid'])
        neighbors_b = neighbors_cached(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)

        confidence = combine_signals(name_sim, ego_sim, neighbor_sim)

        # Pick the bucket before building the record, so discarded pairs
        # never trigger the edge-count queries.
        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            bucket = proposals
        elif confidence >= LOW_CONFIDENCE_THRESHOLD:
            bucket = low_confidence
        else:
            continue

        bucket.append({
            'entity_a': describe(ent_a),
            'entity_b': describe(ent_b),
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
        })

    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), total_pairs


def _proposal_entity_lines(label, ent):
    """Markdown lines for one side ('A' or 'B') of a proposal, ending with a blank line."""
    return [
        f"**Entity {label}:** \"{ent['name']}\" (type: {ent['type']}, {ent['edge_count']} edges)",
        f"  - uuid: `{ent['uuid']}`",
        f"  - summary: {ent['summary']}",
        "",
    ]


def _proposal_section_lines(index, p):
    """Markdown lines for one numbered proposal, including its trailing rule."""
    section = [
        f"### Proposal {index}",
        "",
        "**Decision:** [ ]",
        "",
        f"**Confidence:** {p['confidence']}",
        "",
    ]
    section += _proposal_entity_lines("A", p['entity_a'])
    section += _proposal_entity_lines("B", p['entity_b'])
    section += [
        "**Signals:**",
        f"  - Name similarity: {p['signals']['name_similarity']}",
        f"  - Ego (summary) similarity: {p['signals']['ego_similarity']}",
        f"  - Neighbor overlap: {p['signals']['neighbor_overlap']}",
    ]
    if p['shared_neighbors']:
        shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
        section.append(f"  - Shared neighbors (sample): {shared_str}")
    section += [
        "",
        "**Optional rejection note:** ",
        "",
        "---",
        "",
    ]
    return section


def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    """
    Write one run's results to PROPOSALS_DIR as Markdown and JSON.

    Two files are produced, both named after the current UTC time at
    minute resolution (a second run within the same minute overwrites the
    first):

      * proposals-<timestamp>.md   — human review sheet: statistics,
        review instructions, every high-confidence proposal, and up to 30
        low-confidence candidates.
      * proposals-<timestamp>.json — the same records in full, for tooling.

    Args:
        proposals: Record dicts at or above HIGH_CONFIDENCE_THRESHOLD.
        low_confidence: Record dicts between the two thresholds.
        total_entities: Number of entities scanned (reported only).
        total_comparisons: Number of pairs enumerated (reported only).

    Both lists are sorted by descending confidence before writing; the
    inputs are not modified. Files are written as UTF-8 explicitly: the
    Markdown contains non-ASCII characters ("—", "≥") and must not depend
    on the process locale.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"

    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])

    # NOTE(review): the instructions below mention a proposal_id field, but
    # no such field is emitted in this function — confirm the intended wording.
    lines = [
        f"# Consolidator 0.1 — Run {timestamp}",
        "",
        "## Statistics",
        f"- Entities scanned: {total_entities:,}",
        f"- Pairwise comparisons: {total_comparisons:,}",
        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
        "",
        "## How to review",
        "",
        "For each proposal, mark your decision by changing `[ ]` to one of:",
        "- `[APPROVE]` — execute this merge on next run",
        "- `[REJECT]` — don't merge, don't propose again",
        "- `[DEFER]` — re-surface in next run for further consideration",
        "",
        "Save the file when done. Do not modify proposal_id or uuid fields.",
        "",
        "---",
        "",
        f"## Proposed Merges (n={len(proposals)})",
        "",
    ]

    for i, p in enumerate(proposals_sorted, start=1):
        lines.extend(_proposal_section_lines(i, p))

    lines.append("")
    lines.append(f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)")
    lines.append("")
    # Only the 30 strongest low-confidence pairs are listed; the JSON has all.
    for p in low_sorted[:30]:
        lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})")
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")

    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nProposal log written to: {out_path}")

    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")


def main():
    """
    Script entry point: print a banner, generate proposals, write the
    review logs, then print next-step guidance.
    """
    banner = "=" * 70
    print(banner)
    print("Consolidator 0.1 — Calibration Phase")
    print(banner)
    print()

    run_results = generate_proposals()
    write_proposals_log(*run_results)

    # NOTE(review): the guidance below says a re-run executes approved
    # merges; nothing in this entry point reads decisions back — confirm
    # the message matches the version being shipped.
    print()
    print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER")
    print("for each proposal. Re-run will read decisions and execute approved merges.")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()