scripts/: separate production from experimental and deprecated

Moves 28 experiment scripts to scripts/experiments/ (E1, E1.4, E1.6, E2,
base_class, cascade, cost_test, briefing, consistency, token series).
Moves 2 dissolved-layer scripts to scripts/deprecated/ (consolidator_v0_1.py,
tier1_migration.py — under the bespoke decision both target retired
substrate work).
Removes 19 .bak* files from disk (gitignored, never tracked; git history
is the durable record of every prior version).

The 11 production scripts remain in scripts/. All systemd ExecStart paths,
api.py subprocess calls, and cron jobs continue to resolve correctly —
verified by grep against /etc/systemd/system/aaronai-*.service, scripts/
references in api.py, and the user crontab.

Track 1 inventory cross-cutting finding: scripts/ mixed 11 production
files with 32 experimental scripts and ~20 .bak files. After this commit
a clean-room reader can identify the live workers from a directory listing
alone.

Found by Track 1 inventory 2026-05-02. See
~/aaronai/docs/scripts-reorg-plan-2026-05-02.md for full reasoning.

After commit, run:
1. git log --oneline -3 — show the new commit on top
2. git status — confirm clean working tree (modulo the docs/ untracked files which are intentional)
This commit is contained in:
2026-05-02 23:28:24 +00:00
parent 6f2d274d5d
commit 3f7fba7e0e
30 changed files with 0 additions and 0 deletions
+551
View File
@@ -0,0 +1,551 @@
"""
Consolidator 0.1 — alias resolution agent for BirdAI's Tier 1 substrate.
Reads entities from FalkorDB group_id 'aaron', infers light type labels,
computes pairwise similarity within type blocks using ego summary embedding +
name string distance + neighbor pattern overlap, generates merge proposals
above threshold, writes proposal log for human review.
Does NOT execute merges. 0.1 is the calibration phase — proposals only,
human reviews before any action.
"""
import json
import re
import os
import time
from datetime import datetime, timezone
from collections import defaultdict
from pathlib import Path
import requests
from falkordb import FalkorDB
import numpy as np
# Configuration
# Name of the FalkorDB graph that generate_proposals() selects and reads from.
GROUP_ID = "aaron"
# Pairs whose combined confidence is >= this value are written as merge proposals.
HIGH_CONFIDENCE_THRESHOLD = 0.85 # propose merge at or above this
# Pairs scoring in [LOW, HIGH) are kept as informational low-confidence
# candidates; pairs scoring below LOW are dropped entirely.
LOW_CONFIDENCE_THRESHOLD = 0.65 # keep as low-confidence at or above this
# Directory receiving the proposals-<timestamp>.md / .json review files.
PROPOSALS_DIR = Path("/home/aaron/Nextcloud/Journal/Consolidation")
# NOTE: runs at import time, so merely importing this module creates the directory.
PROPOSALS_DIR.mkdir(parents=True, exist_ok=True)
def cosine_similarity(a, b):
    """Return the cosine of the angle between two embedding vectors.

    Both inputs are converted to float32 arrays before the computation.
    When either vector has zero magnitude the result is defined as 0.0
    instead of dividing by zero.
    """
    vec_a = np.array(a, dtype=np.float32)
    vec_b = np.array(b, dtype=np.float32)
    magnitude_a = np.linalg.norm(vec_a)
    magnitude_b = np.linalg.norm(vec_b)
    # A zero-length vector has no direction, so treat it as "not similar".
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0
    scaled_dot = np.dot(vec_a, vec_b) / (magnitude_a * magnitude_b)
    return float(scaled_dot)
def _is_acronym_of(short, full):
    """Return True when *short* reads as an acronym of the multi-word *full*.

    Two candidate letter strings are built from *full*: the plain word
    initials, and a variant in which words that are already all-uppercase
    (e.g. "AMC") contribute every letter. *short* matches when its
    upper-cased form equals, or is a substring of, either candidate.
    """
    if len(short) >= len(full):
        return False
    words = full.split()
    if len(words) < 2:
        return False
    letters = short.upper()
    initials = ''.join(word[0].upper() for word in words)
    # "Hudson Valley AMC" -> "HVAMC": keep embedded abbreviations whole.
    expanded = ''.join(
        word if (len(word) > 1 and word.isupper()) else word[0].upper()
        for word in words
    )
    return letters in initials or letters in expanded


def name_similarity(name_a, name_b):
    """
    Score how alike two entity names are, in the range 0.0-1.0.

    Checks are applied in this order and the first hit wins:
      1. Case-insensitive equality after stripping whitespace -> 1.0
      2. One name is a character substring of the other
         ("Aaron" / "Aaron Nelson") -> 0.7 + 0.2 * (shorter / longer)
      3. One name is an acronym of the other
         (HVAMC / Hudson Valley Additive Manufacturing Center,
         HVAMC / Hudson Valley AMC) -> 0.85
      4. Otherwise the Jaccard overlap of the word tokens
         ("Aaron Nelson" / "Nelson, Aaron" -> 1.0)

    Missing, empty, or whitespace-only names, and names with no word
    characters that are not identical, score 0.0.
    """
    # Strip once and use the stripped form everywhere, so padded names behave
    # the same in the equality, substring and acronym checks.
    clean_a = (name_a or "").strip()
    clean_b = (name_b or "").strip()
    if not clean_a or not clean_b:
        # Two blank names carry no evidence of being the same entity.
        return 0.0

    a_lower = clean_a.lower()
    b_lower = clean_b.lower()
    if a_lower == b_lower:
        return 1.0

    a_tokens = set(re.findall(r'\w+', a_lower))
    b_tokens = set(re.findall(r'\w+', b_lower))
    if not a_tokens or not b_tokens:
        return 0.0

    # Substring containment: a strong signal, scaled by the length ratio so
    # it never reaches 1.0 for names of different length.
    if a_lower in b_lower or b_lower in a_lower:
        shorter = min(len(a_lower), len(b_lower))
        longer = max(len(a_lower), len(b_lower))
        return 0.7 + 0.2 * (shorter / longer)

    if _is_acronym_of(clean_a, clean_b) or _is_acronym_of(clean_b, clean_a):
        return 0.85

    # Token Jaccard handles reordered names such as "Nelson, Aaron".
    return len(a_tokens & b_tokens) / len(a_tokens | b_tokens)
# Exactly two title-cased words, e.g. "Aaron Nelson".
_FULL_NAME_RE = re.compile(r'^[A-Z][a-z]+ [A-Z][a-z]+$')

_PERSON_SUMMARY_PHRASES = (
    'is a person', 'is a professor', 'is an artist', 'is a colleague',
    'is a friend', 'is a family member', 'works at', 'studied at',
    "'s spouse", "'s child", "'s parent", "'s student",
)
_ORG_NAME_MARKERS = (
    ' inc', ' llc', ' corp', ' company', ' university', ' college',
    ' school', ' institute', ' foundation', ' department',
)
_ORG_SUMMARY_PHRASES = (
    'is a company', 'is a university', 'is an organization',
    'is an institution', 'is a department', 'is a nonprofit',
)
_PROJECT_SUMMARY_PHRASES = (
    'is a project', 'software project', 'is a codebase',
    'is a tool', 'is a system', 'is an application',
    'is a research project', 'is a design project',
)
_PROJECT_NAME_MARKERS = (' project', ' system', ' platform')
_PLACE_SUMMARY_PHRASES = (
    'is a city', 'is a town', 'is a state', 'is a country',
    'is a neighborhood', 'is a region', 'is a location',
)


def _contains_any(text, needles):
    """Return True if any string in *needles* occurs as a substring of *text*."""
    return any(needle in text for needle in needles)


def infer_type(entity_name, summary):
    """
    Light, heuristic type inference used only to block pairwise comparisons.

    Returns one of: 'person', 'organization', 'project', 'place', 'unknown'.
    The categories are tested in that order and the first match wins.
    This is not a precise classifier; when nothing matches the result is
    'unknown'.

    A name made of exactly two title-cased words counts as a person only if
    it does not also carry an organization or project marker; otherwise
    names such as "Stanford University" or "Design Project" would be
    labelled 'person' before the organization/project rules get to run.

    Both arguments may be None; a missing value is treated as empty text.
    """
    name = (entity_name or "").strip()
    name_lower = name.lower()
    summary_lower = (summary or "").lower()

    org_in_name = _contains_any(name_lower, _ORG_NAME_MARKERS)
    project_in_name = _contains_any(name_lower, _PROJECT_NAME_MARKERS)

    # Person: "First Last" shape (unless the name is visibly an org/project),
    # or a summary phrase that describes a person.
    looks_like_full_name = (
        bool(_FULL_NAME_RE.match(name))
        and not org_in_name
        and not project_in_name
    )
    if looks_like_full_name or _contains_any(summary_lower, _PERSON_SUMMARY_PHRASES):
        return "person"

    # Organization: company/institution marker in the name or summary.
    if org_in_name or _contains_any(summary_lower, _ORG_SUMMARY_PHRASES):
        return "organization"

    # Project: software/creative-work marker in the summary or name.
    if _contains_any(summary_lower, _PROJECT_SUMMARY_PHRASES) or project_in_name:
        return "project"

    # Place: location phrase in the summary.
    if _contains_any(summary_lower, _PLACE_SUMMARY_PHRASES):
        return "place"

    return "unknown"
def get_neighbors(graph, entity_uuid, limit=20):
    """Return the set of names of entities one RELATES_TO hop from an entity.

    The query matches the relationship in either direction and returns at
    most *limit* distinct names; empty or null names are discarded.
    """
    # Query text is kept verbatim from the original.
    cypher = """
MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-(other:Entity)
RETURN DISTINCT other.name AS name
LIMIT $limit
"""
    params = {"uuid": entity_uuid, "limit": limit}
    rows = graph.query(cypher, params).result_set
    return {row[0] for row in rows if row[0]}
def neighbor_jaccard(neighbors_a, neighbors_b):
    """
    Containment-style neighbor overlap between two neighbor sets.

    Despite the name this is not Jaccard: the value returned is
    |A ∩ B| / min(|A|, |B|), i.e. the share of the smaller set that also
    appears in the larger one. The result is the same whichever argument
    is the smaller set.

    The original author's rationale: in a personal corpus one entity is
    often a hub with very many edges while its alias candidates are small
    subsets of it. True Jaccard would score that size gap as dissimilarity,
    whereas containment scores it as the subset relationship it is.

    Returns 0.0 when either set is empty.
    """
    if not (neighbors_a or neighbors_b):
        return 0.0
    denominator = min(len(neighbors_a), len(neighbors_b))
    if denominator == 0:
        return 0.0
    shared = neighbors_a & neighbors_b
    return len(shared) / denominator
def get_edge_count(graph, entity_uuid):
    """Return the number of RELATES_TO relationships touching an entity.

    Relationships are matched in either direction. When the query yields no
    rows the count is reported as 0.
    """
    # Query text is kept verbatim from the original.
    cypher = """
MATCH (e:Entity {uuid: $uuid})-[r:RELATES_TO]-()
RETURN count(r) AS c
"""
    rows = graph.query(cypher, {"uuid": entity_uuid}).result_set
    if not rows:
        return 0
    return rows[0][0]
def combine_signals(name_sim, ego_sim, neighbor_sim):
    """
    Fold the three similarity signals into one confidence score.

    The rules are evaluated top to bottom and the first one that applies
    decides the weighting:

      1. neighbor_sim >= 0.7 and ego_sim >= 0.3
         -> 0.4 neighbor + 0.4 ego + 0.2 name
         (strong containment with some ego support: the hub/alias case)
      2. ego_sim < 0.3 and neighbor_sim < 0.4
         -> the larger of ego/neighbor, capped at 0.4
         (weak on both structural signals: probably not aliases)
      3. name_sim >= 0.85 and ego_sim >= 0.5
         -> 0.4 ego + 0.4 name + 0.2 neighbor
      4. otherwise
         -> 0.45 ego + 0.3 neighbor + 0.25 name
    """
    if neighbor_sim >= 0.7 and ego_sim >= 0.3:
        score = 0.4 * neighbor_sim + 0.4 * ego_sim + 0.2 * name_sim
    elif ego_sim < 0.3 and neighbor_sim < 0.4:
        strongest = max(ego_sim, neighbor_sim)
        score = min(0.4, strongest)
    elif name_sim >= 0.85 and ego_sim >= 0.5:
        score = 0.4 * ego_sim + 0.4 * name_sim + 0.2 * neighbor_sim
    else:
        # Default blend: ego is primary, neighbor and name roughly balanced.
        score = 0.45 * ego_sim + 0.3 * neighbor_sim + 0.25 * name_sim
    return score
def compute_summary_embedding(text, model="nomic-embed-text"):
    """
    Request an embedding for *text* from the local Ollama embeddings endpoint.

    Summary embeddings supply the "ego" signal: two entities with different
    names can still have summaries describing overlapping content, which a
    name embedding alone would not capture.

    Only the first 2000 characters of *text* are sent. Returns the value of
    the response's "embedding" field, or None when *text* is missing or
    shorter than 10 characters, or when the request fails for any reason
    (the error is printed, not raised).
    """
    too_short = not text or len(text) < 10
    if too_short:
        return None
    try:
        payload = {"model": model, "prompt": text[:2000]}
        reply = requests.post(
            "http://localhost:11434/api/embeddings",
            json=payload,
            timeout=30,
        )
        reply.raise_for_status()
        body = reply.json()
        return body.get("embedding")
    except Exception as err:
        # Best effort: a failed embedding must not abort the whole run.
        print(f" Embedding error: {err}")
        return None
# Default on-disk location of the uuid -> summary-embedding cache.
_SUMMARY_EMBEDDING_CACHE_PATH = Path(
    "/home/aaron/aaronai/experiments/summary_embeddings_cache.json"
)


def _load_embedding_cache(cache_path):
    """Return the cached uuid -> embedding dict, or {} if absent or unreadable."""
    if not cache_path.exists():
        return {}
    try:
        with open(cache_path, encoding="utf-8") as f:
            cache = json.load(f)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError is a ValueError: a truncated or corrupt cache
        # only costs recomputation, it should not abort the run.
        print(f" Ignoring unreadable embedding cache ({err}); starting fresh")
        return {}
    if not isinstance(cache, dict):
        print(" Ignoring embedding cache with unexpected format; starting fresh")
        return {}
    print(f" Loaded {len(cache)} cached embeddings")
    return cache


def _save_embedding_cache(cache, cache_path):
    """Write the cache to a temp file, then atomically swap it into place."""
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    # An interrupted run leaves the previous cache file intact.
    os.replace(tmp_path, cache_path)


def precompute_summary_embeddings(entities, model="nomic-embed-text", cache_path=None):
    """Attach a ``summary_embedding`` to every entity, using an on-disk cache.

    Each entity dict (which must have ``uuid`` and ``summary`` keys) is
    mutated in place: ``summary_embedding`` is set to the cached or freshly
    computed embedding, or to None when no embedding could be obtained.
    The cache is saved after every 100 newly computed embeddings and once
    more at the end.

    Args:
        entities: list of entity dicts to annotate.
        model: Ollama embedding model name passed to
            compute_summary_embedding().
        cache_path: optional path of the JSON cache file; defaults to the
            script's standard cache location.

    Returns:
        None.

    NOTE(review): the cache is keyed by uuid only, so an entity whose
    summary changed, or a run with a different *model*, will reuse the old
    embedding — confirm that is acceptable.
    """
    print(f"Computing summary embeddings via Ollama ({model})...")
    print(f" Total entities: {len(entities)}")
    if cache_path is None:
        cache_path = _SUMMARY_EMBEDDING_CACHE_PATH
    else:
        cache_path = Path(cache_path)
    cache = _load_embedding_cache(cache_path)

    total = len(entities)
    new_count = 0
    start = time.time()
    for i, entity in enumerate(entities):
        uuid = entity["uuid"]
        if uuid in cache:
            entity["summary_embedding"] = cache[uuid]
            continue
        emb = compute_summary_embedding(entity["summary"], model=model)
        if not emb:
            entity["summary_embedding"] = None
            continue
        entity["summary_embedding"] = emb
        cache[uuid] = emb
        new_count += 1
        # Checkpoint only right after the counter advances, so a run of
        # failed embeddings does not rewrite the cache on every iteration.
        if new_count % 100 == 0:
            _save_embedding_cache(cache, cache_path)
            elapsed = time.time() - start
            rate = new_count / elapsed if elapsed > 0 else 0
            remaining = (total - i - 1) / rate if rate > 0 else 0
            print(f" ... {i+1}/{total} (computed {new_count} new, ~{remaining:.0f}s remaining)")

    # Final save
    _save_embedding_cache(cache, cache_path)
    have_embeddings = sum(1 for e in entities if e.get("summary_embedding"))
    print(f" Done. {have_embeddings}/{total} entities have summary embeddings")
def _count_candidate_pairs(blocks):
    """Return how many pairs _iter_candidate_pairs(blocks) will yield."""
    unknown_n = len(blocks.get('unknown', []))
    typed_sizes = [len(ents) for t, ents in blocks.items() if t != 'unknown']
    within_typed = sum(n * (n - 1) // 2 for n in typed_sizes)
    within_unknown = unknown_n * (unknown_n - 1) // 2
    return within_typed + within_unknown + unknown_n * sum(typed_sizes)


def _iter_candidate_pairs(blocks):
    """Yield the entity pairs to compare, without building the full list.

    Order: pairs inside each typed block, then unknown-vs-unknown pairs,
    then every unknown entity against every typed entity (an 'unknown'
    entity might be of any type).
    """
    typed_blocks = [ents for t, ents in blocks.items() if t != 'unknown']
    unknown_block = blocks.get('unknown', [])
    for ents in typed_blocks + [unknown_block]:
        for i in range(len(ents)):
            for j in range(i + 1, len(ents)):
                yield ents[i], ents[j]
    for ent_unknown in unknown_block:
        for ents in typed_blocks:
            for ent_typed in ents:
                yield ent_unknown, ent_typed


def generate_proposals():
    """Score candidate alias pairs from the graph and bucket them by confidence.

    Loads every Entity that has both a name embedding and a summary from
    the FalkorDB graph named GROUP_ID, attaches summary embeddings, infers
    a coarse type per entity for blocking, then scores each candidate pair
    with name, ego (embedding) and neighbor-overlap signals.

    Returns:
        (proposals, low_confidence, entity_count, pair_count) where
        *proposals* holds records with confidence >= HIGH_CONFIDENCE_THRESHOLD
        and *low_confidence* holds records with confidence in
        [LOW_CONFIDENCE_THRESHOLD, HIGH_CONFIDENCE_THRESHOLD). Nothing is
        merged or written to the graph here.
    """
    db = FalkorDB(host='localhost', port=6379)
    graph = db.select_graph(GROUP_ID)

    # Pull all entities with embeddings
    print(f"Fetching entities from group_id '{GROUP_ID}'...")
    result = graph.query("""
        MATCH (n:Entity)
        WHERE n.name_embedding IS NOT NULL AND n.summary IS NOT NULL
        RETURN n.uuid, n.name, n.summary, n.name_embedding
    """)
    entities = [
        {'uuid': row[0], 'name': row[1], 'summary': row[2], 'embedding': row[3]}
        for row in result.result_set
    ]
    print(f" Loaded {len(entities)} entities with embeddings")

    # Compute summary embeddings (true ego signal, beyond name embeddings)
    precompute_summary_embeddings(entities)

    # Infer types and group entities into blocks in a single pass.
    print("Inferring entity types for blocking...")
    blocks = defaultdict(list)
    for ent in entities:
        ent['inferred_type'] = infer_type(ent['name'], ent['summary'])
        blocks[ent['inferred_type']].append(ent)
    for type_name, members in sorted(blocks.items(), key=lambda item: -len(item[1])):
        print(f" {type_name}: {len(members)}")

    print()
    print("Comparing entities within type blocks...")
    total_pairs = _count_candidate_pairs(blocks)
    print(f" Pairs to compare: {total_pairs:,}")

    # Per-entity graph lookups are memoized: each entity takes part in many
    # pairs but its neighbors and edge count do not change during the run.
    # NOTE(review): get_neighbors is called with its default limit, so a hub
    # entity's neighbor set may be truncated before the containment check —
    # confirm that is intended.
    neighbor_cache = {}
    edge_count_cache = {}

    def neighbors_of(uuid):
        if uuid not in neighbor_cache:
            neighbor_cache[uuid] = get_neighbors(graph, uuid)
        return neighbor_cache[uuid]

    def edge_count_of(uuid):
        if uuid not in edge_count_cache:
            edge_count_cache[uuid] = get_edge_count(graph, uuid)
        return edge_count_cache[uuid]

    def describe(ent):
        return {
            'uuid': ent['uuid'],
            'name': ent['name'],
            'type': ent['inferred_type'],
            'summary': ent['summary'][:200],
            'edge_count': edge_count_of(ent['uuid']),
        }

    proposals = []
    low_confidence = []
    pairs = enumerate(_iter_candidate_pairs(blocks), start=1)
    for comparisons_done, (ent_a, ent_b) in pairs:
        if comparisons_done % 5000 == 0:
            print(f" ... {comparisons_done:,} / {total_pairs:,}")

        # Name similarity handles formal/informal pairs and acronyms.
        name_sim = name_similarity(ent_a['name'], ent_b['name'])

        # Ego similarity prefers SUMMARY embeddings (semantic content) and
        # falls back to name embeddings when either summary embedding is
        # unavailable.
        if ent_a.get('summary_embedding') and ent_b.get('summary_embedding'):
            ego_sim = cosine_similarity(ent_a['summary_embedding'], ent_b['summary_embedding'])
        else:
            ego_sim = cosine_similarity(ent_a['embedding'], ent_b['embedding'])

        # Pre-filter: skip the neighbor query for obviously different pairs.
        # The thresholds are deliberately low so that pairs with weak
        # name/ego signals can still be rescued by neighbor containment.
        if ego_sim < 0.3 and name_sim < 0.15:
            continue

        neighbors_a = neighbors_of(ent_a['uuid'])
        neighbors_b = neighbors_of(ent_b['uuid'])
        neighbor_sim = neighbor_jaccard(neighbors_a, neighbors_b)
        confidence = combine_signals(name_sim, ego_sim, neighbor_sim)

        # Pairs below the low threshold are discarded, so skip building
        # their record (and the edge-count queries it would trigger).
        if confidence < LOW_CONFIDENCE_THRESHOLD:
            continue

        record = {
            'entity_a': describe(ent_a),
            'entity_b': describe(ent_b),
            'confidence': round(confidence, 3),
            'signals': {
                'name_similarity': round(name_sim, 3),
                'ego_similarity': round(ego_sim, 3),
                'neighbor_overlap': round(neighbor_sim, 3),
            },
            'shared_neighbors': sorted(neighbors_a & neighbors_b)[:10],
        }
        if confidence >= HIGH_CONFIDENCE_THRESHOLD:
            proposals.append(record)
        else:
            low_confidence.append(record)

    print(f"\nDone. Proposals: {len(proposals)}, Low-confidence: {len(low_confidence)}")
    return proposals, low_confidence, len(entities), total_pairs
def _entity_markdown(label, entity):
    """Return the markdown lines describing one side (A or B) of a proposal."""
    return [
        f'**Entity {label}:** "{entity["name"]}" (type: {entity["type"]}, {entity["edge_count"]} edges)',
        f" - uuid: `{entity['uuid']}`",
        f" - summary: {entity['summary']}",
        "",
    ]


def _proposal_markdown(index, proposal):
    """Return the markdown lines for one numbered merge proposal."""
    signals = proposal['signals']
    lines = [
        f"### Proposal {index}",
        "",
        "**Decision:** [ ]",
        "",
        f"**Confidence:** {proposal['confidence']}",
        "",
    ]
    lines.extend(_entity_markdown("A", proposal['entity_a']))
    lines.extend(_entity_markdown("B", proposal['entity_b']))
    lines.extend([
        "**Signals:**",
        f" - Name similarity: {signals['name_similarity']}",
        f" - Ego (summary) similarity: {signals['ego_similarity']}",
        f" - Neighbor overlap: {signals['neighbor_overlap']}",
    ])
    if proposal['shared_neighbors']:
        shared_str = ', '.join(f'"{n}"' for n in proposal['shared_neighbors'][:8])
        lines.append(f" - Shared neighbors (sample): {shared_str}")
    lines.extend([
        "",
        "**Optional rejection note:** ",
        "",
        "---",
        "",
    ])
    return lines


def write_proposals_log(proposals, low_confidence, total_entities, total_comparisons):
    """Write the human-review markdown log and a raw JSON dump of one run.

    Both files go to PROPOSALS_DIR as ``proposals-<UTC timestamp>.md`` and
    ``proposals-<UTC timestamp>.json``, with records sorted by descending
    confidence. The markdown lists every high-confidence proposal in full
    and at most the top 30 low-confidence candidates; the JSON contains
    all records.

    Files are written as UTF-8 explicitly, because the markdown contains
    non-ASCII characters ("—", "≥") that would fail to encode under a
    non-UTF-8 default locale.

    Args:
        proposals: records at or above HIGH_CONFIDENCE_THRESHOLD.
        low_confidence: records between the low and high thresholds.
        total_entities: number of entities scanned, for the statistics.
        total_comparisons: number of candidate pairs, for the statistics.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M")
    out_path = PROPOSALS_DIR / f"proposals-{timestamp}.md"
    proposals_sorted = sorted(proposals, key=lambda p: -p['confidence'])
    low_sorted = sorted(low_confidence, key=lambda p: -p['confidence'])

    # NOTE(review): the instructions below mention a "proposal_id" field,
    # but no such field is written by this function — confirm the wording.
    lines = [
        f"# Consolidator 0.1 — Run {timestamp}",
        "",
        "## Statistics",
        f"- Entities scanned: {total_entities:,}",
        f"- Pairwise comparisons: {total_comparisons:,}",
        f"- High-confidence proposals (≥{HIGH_CONFIDENCE_THRESHOLD}): {len(proposals)}",
        f"- Low-confidence candidates ({LOW_CONFIDENCE_THRESHOLD}-{HIGH_CONFIDENCE_THRESHOLD}): {len(low_confidence)}",
        "",
        "## How to review",
        "",
        "For each proposal, mark your decision by changing `[ ]` to one of:",
        "- `[APPROVE]` — execute this merge on next run",
        "- `[REJECT]` — don't merge, don't propose again",
        "- `[DEFER]` — re-surface in next run for further consideration",
        "",
        "Save the file when done. Do not modify proposal_id or uuid fields.",
        "",
        "---",
        "",
        f"## Proposed Merges (n={len(proposals)})",
        "",
    ]
    for i, p in enumerate(proposals_sorted, start=1):
        lines.extend(_proposal_markdown(i, p))

    lines.append("")
    lines.append(f"## Low-Confidence Candidates (n={len(low_confidence)}, informational only, no action)")
    lines.append("")
    for p in low_sorted[:30]:
        signals = p['signals']
        lines.append(
            f'- **{p["confidence"]}** "{p["entity_a"]["name"]}" + "{p["entity_b"]["name"]}" '
            f'(name={signals["name_similarity"]}, ego={signals["ego_similarity"]}, '
            f'nbr={signals["neighbor_overlap"]})'
        )
    if len(low_sorted) > 30:
        lines.append(f"- *(...{len(low_sorted) - 30} more not shown)*")

    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nProposal log written to: {out_path}")

    # Also save raw JSON for downstream tooling
    json_path = PROPOSALS_DIR / f"proposals-{timestamp}.json"
    with open(json_path, 'w', encoding="utf-8") as f:
        json.dump({
            'run_timestamp': timestamp,
            'statistics': {
                'total_entities': total_entities,
                'total_comparisons': total_comparisons,
                'proposal_count': len(proposals),
                'low_confidence_count': len(low_confidence),
            },
            'proposals': proposals_sorted,
            'low_confidence': low_sorted,
        }, f, indent=2)
    print(f"Raw JSON: {json_path}")
def main():
    """Run one calibration pass: generate proposals, then write the review logs.

    Prints a banner, calls generate_proposals(), hands its four results to
    write_proposals_log(), and finishes with a reminder of the review step.
    """
    banner = "=" * 70
    print(banner)
    print("Consolidator 0.1 — Calibration Phase")
    print(banner)
    print()

    merge_candidates, weak_candidates, entity_total, pair_total = generate_proposals()
    write_proposals_log(merge_candidates, weak_candidates, entity_total, pair_total)

    print()
    # NOTE(review): nothing in the visible script reads decisions back or
    # executes merges; the second message describes a later step.
    print("Next: review the proposals markdown file and mark APPROVE/REJECT/DEFER")
    print("for each proposal. Re-run will read decisions and execute approved merges.")


if __name__ == "__main__":
    main()