scripts/: separate production from experimental and deprecated
Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2, base_class, cascade, cost_test, briefing, consistency, token series). Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py, tier1_migration.py — under the bespoke decision both target retired substrate work). Removes 19 .bak* files from disk (gitignored, never tracked; git history is the durable record of every prior version). The 11 production scripts remain in scripts/. All systemd ExecStart paths, api.py subprocess calls, and cron jobs continue to resolve correctly — verified by grep against /etc/systemd/system/aaronai-*.service, scripts/ references in api.py, and the user crontab. Track 1 inventory cross-cutting finding: scripts/ mixed 11 production files with 32 experimental scripts and ~20 .bak files. After this commit a clean-room reader can identify the live workers from a directory listing alone. Found by Track 1 inventory 2026-05-02. See ~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning. After commit, run: 1. git log --oneline -3 — show the new commit on top 2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
This commit is contained in:
@@ -0,0 +1,551 @@
|
||||
"""
|
||||
Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
|
||||
|
||||
Reads entities from FalkorDB group_id 'aaron', infers light type labels,
|
||||
computes pairwise similarity within type blocks using ego summary embedding +
|
||||
name string distance + neighbor pattern overlap, generates merge proposals
|
||||
above threshold, writes proposal log for human review.
|
||||
|
||||
Does NOT execute merges. 0.1 is the calibration phase — proposals only,
|
||||
human reviews before any action.
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from falkordb import FalkorDB
|
||||
import numpy as np
|
||||
|
||||
# Configuration
# Name of the FalkorDB graph that generate_proposals() selects and reads.
GROUP_ID = "aaron"
# Combined-confidence bands used by generate_proposals():
#   score >= HIGH            -> merge proposal
#   LOW <= score < HIGH      -> low-confidence candidate (informational)
#   score < LOW              -> dropped
HIGH_CONFIDENCE_THRESHOLD = 0.85 # propose merge at or above this
LOW_CONFIDENCE_THRESHOLD = 0.65 # log as low-confidence from here up to HIGH
# Directory that receives the proposals-<timestamp>.md / .json review files.
PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
# NOTE: executed at import time, so importing this module creates the directory.
PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine of the angle between two embedding vectors.

    Both inputs are converted to float32 arrays. If either vector has a
    zero norm the similarity is defined as 0.0 instead of dividing by zero.
    The result is always a plain Python float.
    """
    vec_a = np.array(a, dtype=np.float32)
    vec_b = np.array(b, dtype=np.float32)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    # Guard the division: a zero-length vector has no direction.
    if not (norm_a != 0 and norm_b != 0):
        return 0.0
    cosine = np.dot(vec_a, vec_b) / (norm_a * norm_b)
    return float(cosine)
|
||||
|
||||
|
||||
def name_similarity(name_a, name_b):
    """
    Score how alike two entity names are, from 0.0 to 1.0.

    The checks run in this order and the first one that fires wins:

    1. Case-insensitive exact match after stripping          -> 1.0
    2. Either name contains no word tokens                   -> 0.0
    3. One lowercased name is a substring of the other       -> 0.7 to 0.9,
       scaled by the length ratio ("Aaron" / "Aaron Nelson")
    4. One name matches the initials of the other's words    -> 0.85
       ("HV" / "Hudson Valley")
    5. Otherwise the Jaccard overlap of the two word-token sets
       ("Aaron Nelson" / "Nelson, Aaron")
    """
    left = name_a.lower().strip()
    right = name_b.lower().strip()
    if left == right:
        return 1.0

    word_pattern = r'\b\w+\b'
    left_tokens = set(re.findall(word_pattern, left))
    right_tokens = set(re.findall(word_pattern, right))
    if not (left_tokens and right_tokens):
        return 0.0

    # Character-level containment: a strong signal, but capped below 1.0
    # because the names differ in length.
    if left in right or right in left:
        len_short, len_long = sorted((len(left), len(right)))
        return 0.7 + 0.2 * (len_short / len_long)

    def initials_match(candidate, phrase):
        """True when *candidate* appears within the word initials of *phrase*."""
        if len(candidate) >= len(phrase):
            return False
        words = phrase.split()
        if len(words) < 2:
            return False
        initials = ''.join(word[0].upper() for word in words)
        letters = candidate if candidate.isupper() else candidate.upper()
        # Substring match, so a partial run of initials also counts.
        # NOTE(review): only each word's first letter is used, so
        # "HVAMC" vs "Hudson Valley AMC" (initials "HVA") does not match here.
        return letters in initials

    # The acronym check uses the raw names (not lowercased/stripped).
    if initials_match(name_a, name_b) or initials_match(name_b, name_a):
        return 0.85

    shared = left_tokens & right_tokens
    combined = left_tokens | right_tokens
    return len(shared) / len(combined)
|
||||
|
||||
|
||||
def infer_type(entity_name, summary):
    """
    Light type inference for blocking. Heuristic-based, transparent.

    Returns one of: person, organization, project, place, unknown.

    NOT a precise classification — just enough to avoid obviously wrong
    cross-type comparisons (person vs project). When nothing matches the
    result is 'unknown'.

    Categories are tried in the order person, organization, project, place.
    The "First Last" name pattern only counts as a person signal when the
    name carries no organization/project keyword; otherwise two-word names
    such as "Vassar College" or "Apollo Project" would be labelled person
    before the organization/project checks could see them.

    Args:
        entity_name: Entity name as stored in the graph.
        summary: Entity summary text; None is treated as an empty string.
    """
    name_stripped = entity_name.strip()
    name_lower = name_stripped.lower()
    summary_lower = (summary or "").lower()

    org_name_keywords = (
        ' inc', ' llc', ' corp', ' company', ' university', ' college',
        ' school', ' institute', ' foundation', ' department',
    )
    project_name_keywords = (' project', ' system', ' platform')
    person_phrases = (
        'is a person', 'is a professor', 'is an artist', 'is a colleague',
        'is a friend', 'is a family member', 'works at', 'studied at',
        "'s spouse", "'s child", "'s parent", "'s student",
    )
    org_phrases = (
        'is a company', 'is a university', 'is an organization',
        'is an institution', 'is a department', 'is a nonprofit',
    )
    project_phrases = (
        'is a project', 'software project', 'is a codebase',
        'is a tool', 'is a system', 'is an application',
        'is a research project', 'is a design project',
    )
    place_phrases = (
        'is a city', 'is a town', 'is a state', 'is a country',
        'is a neighborhood', 'is a region', 'is a location',
    )

    def mentions(text, needles):
        return any(needle in text for needle in needles)

    org_name = mentions(name_lower, org_name_keywords)
    project_name = mentions(name_lower, project_name_keywords)

    # Person: exactly two title-cased words, unless the name itself says
    # it is an organization or project, or the summary describes a person.
    looks_like_full_name = bool(
        re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+$', name_stripped)
    )
    if (looks_like_full_name and not (org_name or project_name)) \
            or mentions(summary_lower, person_phrases):
        return "person"

    # Organization: company/institution indicators
    if org_name or mentions(summary_lower, org_phrases):
        return "organization"

    # Project: software/creative work indicators
    if project_name or mentions(summary_lower, project_phrases):
        return "project"

    # Place: location indicators (summary only)
    if mentions(summary_lower, place_phrases):
        return "place"

    return "unknown"
|
||||
|
||||
|
||||
def get_neighbors(graph, entity_uuid, limit=20):
    """Return the names of entities one RELATES_TO hop from *entity_uuid*.

    Relationships are matched in either direction. At most *limit* distinct
    names are requested from the graph; rows whose name is empty or null
    are dropped. The result is a set of names.
    """
    # NOTE(review): the query has no ORDER BY, so which names survive the
    # LIMIT is presumably up to the database — confirm if runs must be
    # reproducible for entities with more than `limit` neighbors.
    cypher = """
        MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
        RETURN DISTINCT other.name AS name
        LIMIT $limit
    """
    params = {"uuid": entity_uuid, "limit": limit}
    rows = graph.query(cypher, params).result_set
    return {row[0] for row in rows if row[0]}
|
||||
|
||||
|
||||
def neighbor_jaccard(neighbors_a, neighbors_b):
    """
    Containment-style neighbor overlap (despite the name, not Jaccard).

    Computes |A ∩ B| / min(|A|, |B|): the share of the smaller neighbor
    set that also appears in the larger one. Returns 0.0 when either set
    is empty.

    Why containment rather than Jaccard: in a personal cognitive corpus
    one entity (e.g., the user) is a hub with hundreds of edges, while
    alias candidates are smaller entities whose neighbors are a subset of
    the hub's. Jaccard would score that size imbalance as dissimilarity;
    containment exposes it as the subset relationship it is.

    DEG-RAG used Jaccard because entities in its academic corpora have
    roughly comparable connectivity. Personal corpora have a different
    topology and need a different metric.
    """
    smaller_size = min(len(neighbors_a), len(neighbors_b))
    if smaller_size == 0:
        # Covers both "both empty" and "one side empty".
        return 0.0
    shared = neighbors_a & neighbors_b
    return len(shared) / smaller_size
|
||||
|
||||
|
||||
def get_edge_count(graph, entity_uuid):
    """Count the RELATES_TO relationships attached to *entity_uuid*.

    Relationships are matched in either direction. Returns 0 when the
    query yields no rows.
    """
    cypher = """
        MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
        RETURN count(r) AS c
    """
    rows = graph.query(cypher, {"uuid": entity_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
|
||||
|
||||
|
||||
def combine_signals(name_sim, ego_sim, neighbor_sim):
    """
    Fold the three similarity signals into one confidence score.

    The weighting is tuned for personal cognitive corpora:
    - ego similarity (summary embeddings) is the primary signal
    - containment-based neighbor overlap is a strong secondary signal; it
      catches the Aaron+Nelson case where the smaller entity's neighbors
      are mostly a subset of the hub's
    - name similarity acts as a tie-breaker (acronyms are handled by the
      name_similarity helper)

    This differs from the DEG-RAG defaults because personal corpora have
    asymmetric topology (hub user, subset alias entities).

    Rules are tried in order; the first matching rule decides the score.
    """
    if neighbor_sim >= 0.7 and ego_sim >= 0.3:
        # Asymmetric alias case: strong containment with at least some ego
        # support counts even when the names differ.
        score = 0.4 * neighbor_sim + 0.4 * ego_sim + 0.2 * name_sim
    elif ego_sim < 0.3 and neighbor_sim < 0.4:
        # Weak ego and weak overlap: probably not aliases; cap at 0.4.
        score = min(0.4, max(ego_sim, neighbor_sim))
    elif name_sim >= 0.85 and ego_sim >= 0.5:
        # Near-identical names backed by at least moderate ego similarity.
        score = 0.4 * ego_sim + 0.4 * name_sim + 0.2 * neighbor_sim
    else:
        # Default blend: ego primary, neighbor and name roughly balanced.
        score = 0.45 * ego_sim + 0.3 * neighbor_sim + 0.25 * name_sim
    return score
|
||||
|
||||
|
||||
def compute_summary_embedding(text, model="nomic-embed-text"):
    """
    Embed a summary text through the local Ollama embeddings endpoint.

    Summary embeddings supply the ego-similarity signal from what the
    summaries actually say rather than from the names alone. Aaron's and
    Nelson's name embeddings have low cosine similarity because the names
    are different tokens, but their summaries describe overlapping content
    (faculty member at SUNY, HVAMC, etc.), so summary embeddings should
    give a much stronger ego signal.

    Returns the value of the response's "embedding" field, or None when
    the text is missing or shorter than 10 characters, or when the request
    fails for any reason (the error is printed, not raised). Only the
    first 2000 characters of the text are sent.
    """
    if len(text or "") < 10:
        return None

    payload = {"model": model, "prompt": text[:2000]}
    try:
        reply = requests.post(
            "http://localhost:11434/api/embeddings",
            json=payload,
            timeout=30,
        )
        reply.raise_for_status()
        embedding = reply.json().get("embedding")
    except Exception as err:
        # Best effort: one failed embedding must not abort the whole run.
        print(f" Embedding error: {err}")
        return None
    return embedding
|
||||
|
||||
|
||||
def precompute_summary_embeddings(entities, model="nomic-embed-text", cache_path=None):
    """Attach a "summary_embedding" to every entity, using an on-disk cache.

    Each entity dict is updated in place: "summary_embedding" is set to the
    cached or freshly computed vector, or to None when no embedding could
    be produced. Nothing is returned.

    Args:
        entities: List of dicts with at least "uuid" and "summary" keys.
        model: Ollama embedding model passed to compute_summary_embedding.
        cache_path: JSON cache file (uuid -> embedding). Defaults to
            /home/aaron/aaronai/experiments/summary_embeddings_cache.json.

    The cache is written through a temporary file and renamed into place,
    so an interrupted run cannot leave a half-written file behind, and an
    unreadable or malformed cache is ignored instead of aborting the run.
    It is saved after every 100th new embedding and once more at the end
    if anything new was computed.

    NOTE: the cache is keyed by uuid only, so an entity whose summary has
    changed, or a run with a different model, reuses the stored vector.
    """
    print(f"Computing summary embeddings via Ollama ({model})...")
    print(f" Total entities: {len(entities)}")

    if cache_path is None:
        cache_path = "/home/aaron/aaronai/experiments/summary_embeddings_cache.json"
    cache_path = Path(cache_path)

    cache = {}
    if cache_path.exists():
        try:
            with open(cache_path) as f:
                loaded = json.load(f)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError is a ValueError; start with an empty cache.
            print(f" Ignoring unreadable embedding cache ({err})")
        else:
            if isinstance(loaded, dict):
                cache = loaded
                print(f" Loaded {len(cache)} cached embeddings")
            else:
                print(" Ignoring embedding cache with unexpected structure")

    def save_cache():
        """Write the cache next to its final location, then rename into place."""
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = cache_path.with_name(cache_path.name + ".tmp")
        with open(tmp_path, "w") as f:
            json.dump(cache, f)
        tmp_path.replace(cache_path)

    total = len(entities)
    new_count = 0
    start = time.time()
    for i, e in enumerate(entities):
        if e["uuid"] in cache:
            e["summary_embedding"] = cache[e["uuid"]]
            continue

        emb = compute_summary_embedding(e["summary"], model=model)
        if not emb:
            e["summary_embedding"] = None
            continue

        e["summary_embedding"] = emb
        cache[e["uuid"]] = emb
        new_count += 1

        # Save only right after a new embedding, so failed items that leave
        # new_count on a multiple of 100 do not trigger repeated rewrites.
        if new_count % 100 == 0:
            save_cache()
            elapsed = time.time() - start
            rate = new_count / elapsed if elapsed > 0 else 0
            remaining = (total - i - 1) / rate if rate > 0 else 0
            print(f" ... {i+1}/{total} (computed {new_count} new, ~{remaining:.0f}s remaining)")

    # Final save (skipped when nothing new was computed)
    if new_count:
        save_cache()

    have_embeddings = sum(1 for e in entities if e.get("summary_embedding"))
    print(f" Done. {have_embeddings}/{total} entities have summary embeddings")
|
||||
|
||||
|
||||
def _iter_candidate_pairs(typed_blocks, unknown_block):
    """Yield each entity pair that blocking allows, without building a list."""
    from itertools import combinations

    # Within-type pairs (excluding unknown)
    for ents in typed_blocks.values():
        yield from combinations(ents, 2)

    # Unknown vs unknown
    yield from combinations(unknown_block, 2)

    # Unknown vs typed (unknowns might be any type)
    for ent_unknown in unknown_block:
        for ents in typed_blocks.values():
            for ent_typed in ents:
                yield ent_unknown, ent_typed


def _count_candidate_pairs(typed_blocks, unknown_block):
    """Return how many pairs _iter_candidate_pairs yields for these blocks."""
    typed_sizes = [len(ents) for ents in typed_blocks.values()]
    unknown_size = len(unknown_block)
    within_typed = sum(n * (n - 1) // 2 for n in typed_sizes)
    within_unknown = unknown_size * (unknown_size - 1) // 2
    return within_typed + within_unknown + unknown_size * sum(typed_sizes)


def generate_proposals():
    """Score candidate alias pairs in the graph and bucket them by confidence.

    Steps: load all entities that have a name embedding and a summary,
    attach summary embeddings, infer a coarse type per entity, compare
    pairs within each type block (with 'unknown' entities compared against
    everything), and combine name / ego / neighbor signals into a score.

    Returns:
        (proposals, low_confidence, entity_count, pair_count) where the two
        lists hold record dicts for scores >= HIGH_CONFIDENCE_THRESHOLD and
        for scores in [LOW_CONFIDENCE_THRESHOLD, HIGH_CONFIDENCE_THRESHOLD),
        and pair_count is the number of candidate pairs considered
        (including those rejected by the cheap pre-filter).

    Nothing is merged; this only reads from the graph.
    """
    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)

    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
        MATCH (n:Entity)
        WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
        RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)

    entities = [
        {
            'uuid': row[0],
            'name': row[1],
            'summary': row[2],
            'embedding': row[3],
        }
        for row in result.result_set
    ]
    print(f" Loaded {len(entities)} entities with embeddings")

    # Compute summary embeddings (true ego signal, beyond name embeddings)
    precompute_summary_embeddings(entities)

    # Infer types and group by them for blocking
    print("Inferring entity types for blocking...")
    blocks = defaultdict(list)
    for e in entities:
        e['inferred_type'] = infer_type(e['name'], e['summary'])
        blocks[e['inferred_type']].append(e)
    for t, ents in sorted(blocks.items(), key=lambda item: -len(item[1])):
        print(f" {t}: {len(ents)}")

    # 'unknown' entities get compared against everything (they might be any type)
    # Other types only get compared within their type block + against unknowns
    print()
    print("Comparing entities within type blocks...")
    proposals = []
    low_confidence = []
    comparisons_done = 0

    typed_blocks = {t: ents for t, ents in blocks.items() if t != 'unknown'}
    unknown_block = blocks.get('unknown', [])

    # Pairs are streamed rather than stored: the count is quadratic in the
    # block sizes, so only the total is computed up front for reporting.
    total_pairs = _count_candidate_pairs(typed_blocks, unknown_block)
    print(f" Pairs to compare: {total_pairs:,}")

    # Per-entity graph lookups are cached; the graph is only read here.
    # NOTE(review): get_neighbors is called with its default limit (20), so
    # a hub entity is compared on at most 20 of its neighbors — confirm that
    # is intended for the containment metric.
    cache_neighbors = {}

    def neighbors_cached(uuid):
        if uuid not in cache_neighbors:
            cache_neighbors[uuid] = get_neighbors(graph, uuid)
        return cache_neighbors[uuid]

    cache_edge_counts = {}

    def edge_count_cached(uuid):
        if uuid not in cache_edge_counts:
            cache_edge_counts[uuid] = get_edge_count(graph, uuid)
        return cache_edge_counts[uuid]

    def describe(ent):
        return {
            'uuid': ent['uuid'],
            'name': ent['name'],
            'type': ent['inferred_type'],
            'summary': ent['summary'][:200],
            'edge_count': edge_count_cached(ent['uuid']),
        }

    for ent_a, ent_b in _iter_candidate_pairs(typed_blocks, unknown_block):
        comparisons_done += 1
        if comparisons_done % 5000 == 0:
            print(f" ... {comparisons_done:,} / {total_pairs:,}")

        # Compute name similarity (handles formal/informal pairs, acronyms)
        name_sim = name_similarity(ent_a['name'], ent_b['name'])

        # Compute ego similarity using SUMMARY embeddings (the actual semantic
        # content), falling back to name embeddings if summaries unavailable.
        # Summary similarity catches Aaron+Nelson where name similarity fails.
        if ent_a.get('summary_embedding') and ent_b.get('summary_embedding'):
            ego_sim_quick = cosine_similarity(ent_a['summary_embedding'], ent_b['summary_embedding'])
        else:
            ego_sim_quick = cosine_similarity(ent_a['embedding'], ent_b['embedding'])

        # Pre-filter to avoid expensive neighbor query on obviously different pairs.
        # Lowered thresholds vs DEG-RAG defaults because personal-corpus aliases often
        # have low name_embedding similarity (different surface tokens) but high
        # neighbor overlap. We let weaker name/ego signals through to the neighbor
        # check, which can rescue them via containment metric.
        if ego_sim_quick < 0.3 and name_sim < 0.15:
            continue

        # Full comparison
        neighbors_a = neighbors_cached(ent_a['uuid'])
        neighbors_b = neighbors_cached(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)

        confidence = combine_signals(name_sim, ego_sim_quick, neighbor_sim)

        # Decide before building the record: records for discarded pairs
        # would each cost two extra edge-count queries for nothing.
        # (Written as "not >=" so a NaN score is dropped, as before.)
        if not confidence >= LOW_CONFIDENCE_THRESHOLD:
            continue

        record = {
            'entity_a': describe(ent_a),
            'entity_b': describe(ent_b),
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim_quick, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
        }

        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            proposals.append(record)
        else:
            low_confidence.append(record)

    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), total_pairs
|
||||
|
||||
|
||||
def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    """Write the human-review markdown file and a raw JSON dump for one run.

    Both files go to PROPOSALS_DIR as proposals-<UTC timestamp>.md / .json,
    with the timestamp at minute resolution. Proposals and low-confidence
    candidates are sorted by descending confidence; the markdown lists every
    proposal in full and at most 30 low-confidence candidates, while the
    JSON contains all of both.

    Files are written as UTF-8 explicitly: the markdown contains non-ASCII
    characters ("—", "≥"), so relying on the locale's default encoding can
    fail or produce a file the reviewer's editor misreads.

    Args:
        proposals: Record dicts at or above the high-confidence threshold.
        low_confidence: Record dicts between the low and high thresholds.
        total_entities: Number of entities scanned (reported only).
        total_comparisons: Number of candidate pairs (reported only).
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"

    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])

    # NOTE(review): the instructions below mention a proposal_id field, but
    # no such field is written to the markdown — confirm which is intended.
    lines = [
        f"# Consolidator 0.1 — Run {timestamp}",
        "",
        "## Statistics",
        f"- Entities scanned: {total_entities:,}",
        f"- Pairwise comparisons: {total_comparisons:,}",
        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
        "",
        "## How to review",
        "",
        "For each proposal, mark your decision by changing `[ ]` to one of:",
        "- `[APPROVE]` — execute this merge on next run",
        "- `[REJECT]` — don't merge, don't propose again",
        "- `[DEFER]` — re-surface in next run for further consideration",
        "",
        "Save the file when done. Do not modify proposal_id or uuid fields.",
        "",
        "---",
        "",
        f"## Proposed Merges (n={len(proposals)})",
        "",
    ]

    for i, p in enumerate(proposals_sorted, start=1):
        ent_a = p['entity_a']
        ent_b = p['entity_b']
        signals = p['signals']

        lines.append(f"### Proposal {i}")
        lines.append("")
        lines.append("**Decision:** [ ]")
        lines.append("")
        lines.append(f"**Confidence:** {p['confidence']}")
        lines.append("")
        lines.append(f"**Entity A:** \"{ent_a['name']}\" (type: {ent_a['type']}, {ent_a['edge_count']} edges)")
        lines.append(f" - uuid: `{ent_a['uuid']}`")
        lines.append(f" - summary: {ent_a['summary']}")
        lines.append("")
        lines.append(f"**Entity B:** \"{ent_b['name']}\" (type: {ent_b['type']}, {ent_b['edge_count']} edges)")
        lines.append(f" - uuid: `{ent_b['uuid']}`")
        lines.append(f" - summary: {ent_b['summary']}")
        lines.append("")
        lines.append("**Signals:**")
        lines.append(f" - Name similarity: {signals['name_similarity']}")
        lines.append(f" - Ego (summary) similarity: {signals['ego_similarity']}")
        lines.append(f" - Neighbor overlap: {signals['neighbor_overlap']}")
        if p['shared_neighbors']:
            shared_str = ', '.join(f'"{n}"' for n in p['shared_neighbors'][:8])
            lines.append(f" - Shared neighbors (sample): {shared_str}")
        lines.append("")
        lines.append("**Optional rejection note:** ")
        lines.append("")
        lines.append("---")
        lines.append("")

    lines.append("")
    lines.append(f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)")
    lines.append("")
    for p in low_sorted[:30]:
        lines.append(f"- **{p['confidence']}** \"{p['entity_a']['name']}\" + \"{p['entity_b']['name']}\" (name={p['signals']['name_similarity']}, ego={p['signals']['ego_similarity']}, nbr={p['signals']['neighbor_overlap']})")
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")

    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nProposal log written to: {out_path}")

    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w', encoding="utf-8") as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")
|
||||
|
||||
|
||||
def main():
    """Run one calibration pass: generate proposals, then write the review files."""
    rule = "=" * 70
    print(rule)
    print("Consolidator 0.1 — Calibration Phase")
    print(rule)
    print()

    # generate_proposals() returns exactly the four arguments that
    # write_proposals_log() takes, in the same order.
    run_results = generate_proposals()
    write_proposals_log(*run_results)

    print()
    print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER")
    print("for each proposal. Re-run will read decisions and execute approved merges.")
|
||||
|
||||
|
||||
# Run the calibration pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user