465f2f725b
- api.py: strip CV pinning workaround (parity violation, see architecture doc) - dream.py: F1 — retrieve_graphiti() now accepts excluded_sources, over-fetches 3x and filters in-process. Was silently dropping the parameter; would have confounded E3 with broken cross-stage exclusion in Graphiti arm. - watcher.py + ingest.py: F14 — drop full_text[:50000] truncation. Was propagating through entire cascade. Postgres TEXT can hold up to 1GB. - corpus_integrity.py: F37 — same truncation, third path now clean. Backups: api.py.bak.*, dream.py.bak.*, watcher.py.bak.*, ingest.py.bak.*, corpus_integrity.py.bak.* timestamped pre-fix. Re-cascaded Shop Class as Soulcraft (only already-cascaded source affected by F14, 414KB).
443 lines
16 KiB
Plaintext
443 lines
16 KiB
Plaintext
"""
|
|
Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
|
|
|
|
Reads entities from FalkorDB group_id 'aaron', infers light type labels,
|
|
computes pairwise similarity within type blocks using ego summary embedding +
|
|
name string distance + neighbor pattern overlap, generates merge proposals
|
|
above threshold, writes proposal log for human review.
|
|
|
|
Does NOT execute merges. 0.1 is the calibration phase — proposals only,
|
|
human reviews before any action.
|
|
"""
|
|
import json
|
|
import re
|
|
import os
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
from falkordb import FalkorDB
|
|
import numpy as np
|
|
|
|
# Configuration
# Name of the FalkorDB graph that generate_proposals() selects.
GROUP_ID = "aaron"
# confidence >= this value: the pair is written as a merge proposal.
HIGH_CONFIDENCE_THRESHOLD = 0.85
# LOW <= confidence < HIGH: the pair is listed as a low-confidence candidate.
# Pairs scoring below this value are not recorded at all.
LOW_CONFIDENCE_THRESHOLD = 0.65
# Directory that receives the Markdown review log and the raw JSON dump.
PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
# Runs at import time, so the directory exists before anything is written.
PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def cosine_similarity(a, b):
    """Return the cosine similarity of two embedding vectors as a float.

    Both inputs are converted to float32 arrays. When either vector has a
    zero norm the result is defined as 0.0 rather than dividing by zero.
    """
    vec_a = np.asarray(a, dtype=np.float32)
    vec_b = np.asarray(b, dtype=np.float32)
    norms = (np.linalg.norm(vec_a), np.linalg.norm(vec_b))
    if any(norm == 0 for norm in norms):
        return 0.0
    return float(np.dot(vec_a, vec_b) / (norms[0] * norms[1]))
|
|
|
|
|
|
def name_similarity(name_a, name_b):
    """
    Score how likely two entity names denote the same thing (0.0 to 1.0).

    Checks are applied in this order and the first that fires wins:

    1. Case-insensitive exact match after stripping whitespace -> 1.0
    2. One name is a substring of the other (Aaron / Aaron Nelson)
       -> 0.7 to 0.9, scaled by the ratio of the two lengths
    3. Acronym match (HVAMC / Hudson Valley Additive Manufacturing Center,
       HVAMC / Hudson Valley AMC) -> 0.85
    4. Jaccard overlap of word tokens (Aaron Nelson / Nelson, Aaron)

    A missing (None) or blank name carries no evidence and scores 0.0, as
    does a name with no word characters at all.
    """
    a_clean = (name_a or "").strip()
    b_clean = (name_b or "").strip()

    # Two blank names are "equal" as strings but say nothing about identity.
    if not a_clean or not b_clean:
        return 0.0

    a_lower = a_clean.lower()
    b_lower = b_clean.lower()
    if a_lower == b_lower:
        return 1.0

    # Tokenize
    a_tokens = set(re.findall(r'\b\w+\b', a_lower))
    b_tokens = set(re.findall(r'\b\w+\b', b_lower))
    if not a_tokens or not b_tokens:
        return 0.0

    # Substring containment (handles "Aaron" in "Aaron Nelson").
    # Strong signal but never 1.0, because the lengths differ.
    if a_lower in b_lower or b_lower in a_lower:
        shorter, longer = sorted((len(a_lower), len(b_lower)))
        return 0.7 + 0.2 * (shorter / longer)

    def is_acronym(short, full):
        """Return True when `short` looks like an acronym of multi-word `full`."""
        if len(short) >= len(full):
            return False
        words = full.split()
        if len(words) < 2:
            return False
        short_upper = short.upper()
        # One initial per word: "Hudson Valley AMC" -> "HVA".
        plain_initials = ''.join(w[0].upper() for w in words)
        # Words that are already abbreviations contribute every letter:
        # "Hudson Valley AMC" -> "HVAMC". Exact match only, so that an
        # all-caps full name does not match arbitrary letter runs.
        expanded_initials = ''.join(
            w if (len(w) > 1 and w.isalpha() and w.isupper()) else w[0].upper()
            for w in words
        )
        return short_upper in plain_initials or short_upper == expanded_initials

    if is_acronym(a_clean, b_clean) or is_acronym(b_clean, a_clean):
        return 0.85

    # Token Jaccard (handles "Aaron Nelson" vs "Nelson, Aaron")
    return len(a_tokens & b_tokens) / len(a_tokens | b_tokens)
|
|
|
|
|
|
def infer_type(entity_name, summary):
    """
    Light type inference for blocking. Heuristic-based, transparent.

    Returns one of: person, organization, project, place, unknown.
    Categories are tested in that order and the first match wins.

    NOT a precise classification — just enough to avoid obviously wrong
    cross-type comparisons (person vs project). When in doubt, returns
    'unknown'.

    A name made of exactly two capitalised words counts as a person only
    when it carries no organization or project cue, so "Vassar College" is
    an organization and "Apollo Project" a project rather than people.

    Args:
        entity_name: Entity name; None is treated as an empty name.
        summary: Free-text entity summary; None is treated as empty.
    """
    name = (entity_name or "").strip()
    name_lower = name.lower()
    summary_lower = (summary or "").lower()

    def mentions(haystack, needles):
        """Return True when any needle occurs as a substring of haystack."""
        return any(needle in haystack for needle in needles)

    # Name cues are computed up front because they veto the generic
    # "First Last" pattern below.
    org_name_cue = mentions(name_lower, (
        ' inc', ' llc', ' corp', ' company', ' university', ' college',
        ' school', ' institute', ' foundation', ' department',
    ))
    project_name_cue = mentions(name_lower, (' project', ' system', ' platform'))

    # Person: First+Last name pattern (two title-cased words, no other
    # tokens), or a summary that describes a person.
    looks_like_full_name = (
        re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', name) is not None
        and not org_name_cue
        and not project_name_cue
    )
    described_as_person = mentions(summary_lower, (
        'is a person', 'is a professor', 'is an artist', 'is a colleague',
        'is a friend', 'is a family member', 'works at', 'studied at',
        "'s spouse", "'s child", "'s parent", "'s student",
    ))
    if looks_like_full_name or described_as_person:
        return "person"

    # Organization: company/institution indicators
    described_as_org = mentions(summary_lower, (
        'is a company', 'is a university', 'is an organization',
        'is an institution', 'is a department', 'is a nonprofit',
    ))
    if org_name_cue or described_as_org:
        return "organization"

    # Project: software/creative work indicators
    described_as_project = mentions(summary_lower, (
        'is a project', 'software project', 'is a codebase',
        'is a tool', 'is a system', 'is an application',
        'is a research project', 'is a design project',
    ))
    if described_as_project or project_name_cue:
        return "project"

    # Place: location indicators
    described_as_place = mentions(summary_lower, (
        'is a city', 'is a town', 'is a state', 'is a country',
        'is a neighborhood', 'is a region', 'is a location',
    ))
    if described_as_place:
        return "place"

    # Default
    return "unknown"
|
|
|
|
|
|
def get_neighbors(graph, entity_uuid, limit=20):
    """Return the set of names of entities one RELATES_TO hop from an entity.

    Args:
        graph: Object exposing `query(cypher, params)` whose result has a
            `result_set` of rows.
        entity_uuid: Value matched against the `uuid` property of the entity.
        limit: Maximum number of distinct neighbor names requested.

    Returns:
        Set of neighbor names; null or empty names are left out.
    """
    # The query has a LIMIT but no ORDER BY, so it does not itself fix which
    # names come back when an entity has more than `limit` neighbors.
    cypher = """
    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
    RETURN DISTINCT other.name AS name
    LIMIT $limit
    """
    rows = graph.query(cypher, {"uuid": entity_uuid, "limit": limit}).result_set
    names = set()
    for row in rows:
        neighbor_name = row[0]
        if neighbor_name:
            names.add(neighbor_name)
    return names
|
|
|
|
|
|
def neighbor_jaccard(neighbors_a, neighbors_b):
    """Return |A ∩ B| / |A ∪ B| for two neighbor sets.

    Two empty inputs score 0.0: sharing no neighbors at all is treated as
    no evidence rather than as perfect overlap.
    """
    if not (neighbors_a or neighbors_b):
        return 0.0
    shared = neighbors_a & neighbors_b
    combined = neighbors_a | neighbors_b
    return len(shared) / len(combined)
|
|
|
|
|
|
def get_edge_count(graph, entity_uuid):
    """Return the number of RELATES_TO relationships attached to an entity.

    The pattern is undirected, so relationships in either direction are
    counted. Returns 0 when the query yields no rows.
    """
    cypher = """
    MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
    RETURN count(r) AS c
    """
    rows = graph.query(cypher, {"uuid": entity_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
|
|
|
|
|
|
def combine_signals(name_sim, ego_sim, neighbor_sim):
    """
    Fold the three similarity signals into one confidence score.

    Ego similarity always carries half of the weight. The remaining half is
    split between name similarity and neighbor overlap:

    - 0.30 name / 0.20 neighbor when the name match is very strong
      (>= 0.85) and ego similarity is at least moderate (>= 0.65)
    - 0.25 name / 0.25 neighbor otherwise

    When ego similarity is below 0.4 the other signals are ignored and the
    ego similarity itself is returned: entities with unrelated ego
    information are probably not aliases even if their names match.
    """
    if ego_sim < 0.4:
        # The 0.5 cap cannot bind here because ego_sim is already below 0.4.
        return min(0.5, ego_sim)

    strong_name_match = name_sim >= 0.85 and ego_sim >= 0.65
    if strong_name_match:
        name_weight, neighbor_weight = 0.3, 0.2
    else:
        name_weight, neighbor_weight = 0.25, 0.25

    return 0.5 * ego_sim + name_weight * name_sim + neighbor_weight * neighbor_sim
|
|
|
|
|
|
def generate_proposals():
    """Scan the entity graph and collect candidate alias pairs.

    Steps:
      1. Load every Entity node that has both a name embedding and a summary
         from the FalkorDB graph named by GROUP_ID (localhost:6379).
      2. Assign each entity a coarse type with infer_type() and group by it.
      3. Build candidate pairs: all pairs inside each typed block, all pairs
         of 'unknown' entities, and every 'unknown' entity against every
         typed entity.
      4. Score each pair with name similarity, embedding cosine ("ego")
         similarity and neighbor overlap, combined by combine_signals().
      5. Keep pairs at or above HIGH_CONFIDENCE_THRESHOLD as proposals and
         pairs at or above LOW_CONFIDENCE_THRESHOLD as low-confidence
         candidates. Everything else is discarded.

    The ego signal is the cosine of the stored `name_embedding` vectors.
    NOTE(review): confirm that this property is the intended "ego summary"
    embedding.

    Nothing in the graph is modified.

    Returns:
        Tuple (proposals, low_confidence, entity_count, pair_count).
        pair_count is the number of candidate pairs built in step 3,
        including pairs later skipped by the cheap pre-filter.
    """
    from itertools import combinations  # only needed for pair construction

    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)

    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
        MATCH (n:Entity)
        WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
        RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)
    entities = [
        {'uuid': row[0], 'name': row[1], 'summary': row[2], 'embedding': row[3]}
        for row in result.result_set
    ]
    print(f" Loaded {len(entities)} entities with embeddings")

    # Infer types and group by inferred type for blocking
    print("Inferring entity types for blocking...")
    blocks = defaultdict(list)
    for e in entities:
        e['inferred_type'] = infer_type(e['name'], e['summary'])
        blocks[e['inferred_type']].append(e)
    for t, members in sorted(blocks.items(), key=lambda kv: -len(kv[1])):
        print(f" {t}: {len(members)}")

    # 'unknown' entities get compared against everything (they might be any type)
    # Other types only get compared within their type block + against unknowns
    print()
    print("Comparing entities within type blocks...")
    proposals = []
    low_confidence = []

    typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'}
    unknown_block = blocks.get('unknown', [])

    pairs_to_compare = []
    # Within-type pairs (excluding unknown)
    for ents in typed_blocks.values():
        pairs_to_compare.extend(combinations(ents, 2))
    # Unknown vs unknown
    pairs_to_compare.extend(combinations(unknown_block, 2))
    # Unknown vs typed (unknowns might be any type)
    for ent_unknown in unknown_block:
        for ents in typed_blocks.values():
            pairs_to_compare.extend((ent_unknown, ent_typed) for ent_typed in ents)

    total_pairs = len(pairs_to_compare)
    print(f" Pairs to compare: {total_pairs:,}")

    # Per-entity graph lookups are cached: the same entity appears in many
    # pairs and the graph is not modified during the run.
    neighbor_cache = {}
    edge_count_cache = {}

    def neighbors_cached(uuid):
        if uuid not in neighbor_cache:
            neighbor_cache[uuid] = get_neighbors(graph, uuid)
        return neighbor_cache[uuid]

    def edge_count_cached(uuid):
        if uuid not in edge_count_cache:
            edge_count_cache[uuid] = get_edge_count(graph, uuid)
        return edge_count_cache[uuid]

    def describe(ent):
        """Build the per-entity part of a proposal record."""
        return {
            'uuid': ent['uuid'],
            'name': ent['name'],
            'type': ent['inferred_type'],
            'summary': ent['summary'][:200],
            'edge_count': edge_count_cached(ent['uuid']),
        }

    for comparisons_done, (ent_a, ent_b) in enumerate(pairs_to_compare, start=1):
        if comparisons_done % 5000 == 0:
            print(f" ... {comparisons_done:,} / {total_pairs:,}")

        name_sim = name_similarity(ent_a['name'], ent_b['name'])
        ego_sim = cosine_similarity(ent_a['embedding'], ent_b['embedding'])

        # Pre-filter to avoid expensive neighbor query on obviously different pairs
        if ego_sim < 0.5 and name_sim < 0.3:
            continue

        # Full comparison
        neighbors_a = neighbors_cached(ent_a['uuid'])
        neighbors_b = neighbors_cached(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)

        confidence = combine_signals(name_sim, ego_sim, neighbor_sim)

        # Pick the bucket first so that discarded pairs cost no edge-count
        # queries and no record construction.
        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            bucket = proposals
        elif confidence >= LOW_CONFIDENCE_THRESHOLD:
            bucket = low_confidence
        else:
            continue

        bucket.append({
            'entity_a': describe(ent_a),
            'entity_b': describe(ent_b),
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
        })

    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), total_pairs
|
|
|
|
|
|
def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    """Write one run's results as a Markdown review log plus a raw JSON dump.

    Both files go to PROPOSALS_DIR and are named after the current UTC time
    at minute resolution (`proposals-YYYY-MM-DD-HHMM.md` / `.json`), so a
    second run within the same minute overwrites the first.

    The Markdown file lists every proposal, highest confidence first, with
    an empty `[ ]` decision box, followed by at most 30 low-confidence
    candidates. The JSON file contains all records of both lists.

    Files are written as UTF-8 explicitly: the Markdown contains non-ASCII
    characters and must not depend on the locale's default encoding.

    NOTE(review): the review instructions mention a `proposal_id` field, but
    no such field is written; proposals are identified only by position and
    by the two entity uuids.

    Args:
        proposals: Records at or above HIGH_CONFIDENCE_THRESHOLD.
        low_confidence: Records between the two thresholds.
        total_entities: Number of entities scanned, for the statistics.
        total_comparisons: Number of candidate pairs, for the statistics.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"

    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])

    lines = [
        f"# Consolidator 0.1 — Run {timestamp}",
        "",
        "## Statistics",
        f"- Entities scanned: {total_entities:,}",
        f"- Pairwise comparisons: {total_comparisons:,}",
        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
        "",
        "## How to review",
        "",
        "For each proposal, mark your decision by changing `[ ]` to one of:",
        "- `[APPROVE]` — execute this merge on next run",
        "- `[REJECT]` — don't merge, don't propose again",
        "- `[DEFER]` — re-surface in next run for further consideration",
        "",
        "Save the file when done. Do not modify proposal_id or uuid fields.",
        "",
        "---",
        "",
        f"## Proposed Merges (n={len(proposals)})",
        "",
    ]

    for i, p in enumerate(proposals_sorted, start=1):
        lines.extend([
            f"### Proposal {i}",
            "",
            "**Decision:** [ ]",
            "",
            f"**Confidence:** {p['confidence']}",
            "",
        ])
        for label, key in (("A", 'entity_a'), ("B", 'entity_b')):
            ent = p[key]
            lines.extend([
                f"**Entity {label}:** \"{ent['name']}\" (type: {ent['type']}, {ent['edge_count']} edges)",
                f" - uuid: `{ent['uuid']}`",
                f" - summary: {ent['summary']}",
                "",
            ])
        signals = p['signals']
        lines.extend([
            "**Signals:**",
            f" - Name similarity: {signals['name_similarity']}",
            f" - Ego (summary) similarity: {signals['ego_similarity']}",
            f" - Neighbor overlap: {signals['neighbor_overlap']}",
        ])
        if p['shared_neighbors']:
            shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
            lines.append(f" - Shared neighbors (sample): {shared_str}")
        lines.extend([
            "",
            "**Optional rejection note:** ",
            "",
            "---",
            "",
        ])

    lines.extend([
        "",
        f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)",
        "",
    ])
    for p in low_sorted[:30]:
        lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})")
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")

    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nProposal log written to: {out_path}")

    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w', encoding="utf-8") as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")
|
|
|
|
|
|
def main():
    """Run one calibration pass: generate proposals, write the logs, print next steps."""
    banner = "=" * 70
    print(banner)
    print("Consolidator 0.1 — Calibration Phase")
    print(banner)
    print()

    # generate_proposals() returns
    # (proposals, low_confidence, total_entities, total_comparisons),
    # which is exactly the argument order of write_proposals_log().
    run_results = generate_proposals()
    write_proposals_log(*run_results)

    print()
    print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER")
    print("for each proposal. Re-run will read decisions and execute approved merges.")
|
|
|
|
|
|
# Run the calibration pass only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|