chore: archive deprecated chromadb and migration scripts

2026-04-28 00:15:46 +00:00
parent d5b5c2ec14
commit 037d747573
10 changed files with 486 additions and 11 deletions
+250
@@ -0,0 +1,250 @@
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
import anthropic
from datetime import datetime
load_dotenv(Path.home() / "aaronai" / ".env")
memory_path = Path.home() / "aaronai" / "memory.md"
db_path = str(Path.home() / "aaronai" / "db")
print("Loading Aaron AI...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=db_path)
collection = chroma_client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine"}
)
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
called FWN3D. He has a background in graffiti lettering and vector illustration.
You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
Use this context to give answers grounded in his actual work and history. When helping him write
or create, match his voice and draw on his existing materials. Be direct and specific -
Aaron values precision over padding. Always cite which documents you drew from when relevant.
You have access to web search. Use it automatically when:
- Questions require current data (salaries, job postings, prices, news)
- Questions reference specific institutions, people, or organizations you need to verify
- Aaron's documents and memory don't contain sufficient information to answer well
Do not announce that you are searching. Just search and incorporate results naturally."""
CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
conversation_history = []
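# Web search tool definition (Anthropic's server-side web_search tool type); searches run when the model invokes the tool.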
TOOLS = [
{
"type": "web_search_20250305",
"name": "web_search"
}
]
def load_memory():
if memory_path.exists():
return memory_path.read_text(encoding="utf-8")
return ""
def save_memory(content):
memory_path.write_text(content, encoding="utf-8")
def add_to_memory(new_item):
memory = load_memory()
timestamp = datetime.now().strftime("%Y-%m-%d")
note = f"\n- [{timestamp}] {new_item}"
if "## Notes" not in memory:
memory += "\n\n## Notes"
memory += note
save_memory(memory)
def remove_from_memory(item):
memory = load_memory()
lines = memory.split("\n")
filtered = [l for l in lines if item.lower() not in l.lower()]
save_memory("\n".join(filtered))
return len(lines) - len(filtered)
def get_pinned_cv_context():
results = collection.get(
where={"source": "Aaron Nelson CV 2024.pdf"},
include=["documents", "metadatas"]
)
return results["documents"], results["metadatas"]
def is_professional_query(query):
keywords = [
"grant", "publication", "exhibition", "award", "fellowship",
"experience", "position", "job", "career", "cv", "resume",
"research", "work history", "accomplishment", "teaching",
"course", "client", "consultation", "presentation", "workshop",
"education", "degree", "institution", "service", "committee"
]
return any(keyword in query.lower() for keyword in keywords)
def retrieve_context(query, n_results=8):
query_embedding = embedder.encode([query]).tolist()
results = collection.query(
query_embeddings=query_embedding,
n_results=n_results,
include=["documents", "metadatas", "distances"]
)
context_pieces = []
sources = []
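    # For career-related questions, pin every CV chunk into the context regardless of vector distance.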
if is_professional_query(query):
cv_docs, cv_metas = get_pinned_cv_context()
for doc, meta in zip(cv_docs, cv_metas):
context_pieces.append(f"[CV] {doc}")
sources.append(meta["source"])
for doc, meta, dist in zip(
results["documents"][0],
results["metadatas"][0],
results["distances"][0]
):
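        # Chroma returns cosine distance; 1 - distance gives a rough similarity score, and weak matches are dropped.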
relevance = 1 - dist
if relevance > 0.3 and meta["source"] not in CV_SOURCES:
context_pieces.append(doc)
sources.append(meta["source"])
return context_pieces, sources
def handle_command(user_input):
stripped = user_input.strip().lower()
if stripped == "show memory":
memory = load_memory()
print(f"\nAaron AI: Current memory:\n\n{memory}")
return True
if stripped.startswith("remember:"):
item = user_input[9:].strip()
add_to_memory(item)
print(f"\nAaron AI: Saved to memory: '{item}'")
return True
if stripped.startswith("forget:"):
item = user_input[7:].strip()
removed = remove_from_memory(item)
if removed:
print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
else:
print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
return True
if stripped == "clear":
conversation_history.clear()
print("\nAaron AI: Conversation history cleared.")
return True
return False
def chat(user_message):
memory = load_memory()
context_pieces, sources = retrieve_context(user_message)
context_parts = []
if memory:
context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
if context_pieces:
context_str = "\n\n---\n\n".join(context_pieces)
unique_sources = list(set(sources))
context_parts.append(
f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
)
context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
full_message = context_block + user_message
# Build messages for this turn
messages = conversation_history + [{"role": "user", "content": full_message}]
# Agentic loop to handle tool use
while True:
response = anthropic_client.messages.create(
model="claude-sonnet-4-6",
max_tokens=2048,
system=SYSTEM_PROMPT,
tools=TOOLS,
messages=messages
)
# Check if we need to handle tool calls
if response.stop_reason == "tool_use":
# Add assistant response to messages
messages.append({"role": "assistant", "content": response.content})
# Process each tool use block
tool_results = []
for block in response.content:
if block.type == "tool_use":
tool_results.append({
"type": "tool_result",
"tool_use_id": block.id,
"content": "Search completed"
})
# Add tool results and continue
messages.append({"role": "user", "content": tool_results})
else:
# Final response - extract text
assistant_message = ""
for block in response.content:
if hasattr(block, "text"):
assistant_message += block.text
# Update conversation history with clean versions
conversation_history.append({"role": "user", "content": full_message})
conversation_history.append({"role": "assistant", "content": assistant_message})
if len(conversation_history) > 20:
conversation_history.pop(0)
conversation_history.pop(0)
return assistant_message, sources
def main():
print("Aaron AI ready. Corpus, memory, and web search loaded.")
print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
print("=" * 60)
while True:
try:
user_input = input("\nYou: ").strip()
if not user_input:
continue
if user_input.strip().lower() == "quit":
print("Goodbye.")
break
if handle_command(user_input):
continue
response, sources = chat(user_input)
print(f"\nAaron AI: {response}")
if sources:
unique = list(set(sources))
print(f"\n[Sources: {', '.join(unique)}]")
except KeyboardInterrupt:
print("\nGoodbye.")
break
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
main()
+152
@@ -0,0 +1,152 @@
import json
from pathlib import Path
from datetime import datetime
import chromadb
from sentence_transformers import SentenceTransformer
# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
def extract_messages(convo):
"""Extract ordered user/assistant messages from a conversation."""
mapping = convo.get("mapping", {})
messages = []
for node in mapping.values():
msg = node.get("message")
if not msg:
continue
role = msg.get("author", {}).get("role")
if role not in ["user", "assistant"]:
continue
content = msg.get("content", {})
parts = content.get("parts", [])
# Extract text parts only
text = ""
for part in parts:
if isinstance(part, str):
text += part
elif isinstance(part, dict) and part.get("content_type") == "text":
text += part.get("text", "")
text = text.strip()
if not text:
continue
create_time = msg.get("create_time") or 0
messages.append((create_time, role, text))
# Sort by timestamp
messages.sort(key=lambda x: x[0])
return messages
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
"""Convert a conversation into overlapping text chunks."""
# Build full conversation text
lines = [f"[Conversation: {title}]", ""]
for _, role, text in messages:
label = "Aaron" if role == "user" else "ChatGPT"
lines.append(f"{label}: {text}")
lines.append("")
full_text = "\n".join(lines)
# Split into word-level chunks with overlap
words = full_text.split()
chunks = []
start = 0
while start < len(words):
end = start + chunk_size
chunk = " ".join(words[start:end])
if chunk.strip():
chunks.append(chunk)
start += chunk_size - overlap
return chunks
def ingest_file(json_path):
print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
print(f"Found {len(data)} conversations")
total_chunks = 0
skipped = 0
for i, convo in enumerate(data):
title = convo.get("title", "Untitled")
convo_id = convo.get("id", f"convo_{i}")
create_time = convo.get("create_time", 0)
try:
date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except Exception:
date_str = "unknown"
messages = extract_messages(convo)
if len(messages) < 2:
skipped += 1
continue
chunks = chunk_conversation(title, messages)
if not chunks:
skipped += 1
continue
# Embed and store
embeddings = embedder.encode(chunks).tolist()
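        # Deterministic chunk ids (conversation id + index) so re-running the ingest upserts instead of duplicating.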
ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
metadatas = [{
"source": f"ChatGPT: {title}",
"filepath": str(json_path),
"date": date_str,
"type": "chatgpt_conversation"
} for _ in chunks]
collection.upsert(
documents=chunks,
embeddings=embeddings,
ids=ids,
metadatas=metadatas
)
total_chunks += len(chunks)
print(f" [{i+1}/{len(data)}] {title[:60]}{len(chunks)} chunks ({date_str})")
print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
return total_chunks
def main():
export_dir = Path(EXPORT_DIR)
files = [
export_dir / "conversations-000.json",
export_dir / "conversations-001.json"
]
grand_total = 0
for f in files:
if f.exists():
grand_total += ingest_file(f)
else:
print(f"Not found: {f}")
print(f"\nTotal chunks added to corpus: {grand_total}")
print(f"Database at: {db_path}")
if __name__ == "__main__":
main()
+189
@@ -0,0 +1,189 @@
import os
import json
import json as json_module
from pathlib import Path
import chromadb
import psycopg2
from sentence_transformers import SentenceTransformer

def get_pg():
    # get_pg() is called by the conversation ingest below but was not defined in this file;
    # minimal sketch assuming the PG_DSN connection string is set in the environment.
    return psycopg2.connect(os.getenv("PG_DSN"))
# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
def extract_messages(convo):
messages = []
for msg in convo.get("chat_messages", []):
role = msg.get("sender", "")
if role not in ["human", "assistant"]:
continue
content = msg.get("content", [])
text = ""
if isinstance(content, str):
text = content
elif isinstance(content, list):
for block in content:
if isinstance(block, dict) and block.get("type") == "text":
text += block.get("text", "")
elif isinstance(block, str):
text += block
text = text.strip()
if not text:
continue
messages.append((msg.get("created_at", ""), role, text))
return messages
def chunk_conversation(convo):
chunks = []
title = convo.get("name", "Untitled conversation")
uuid = convo.get("uuid", "")
created_at = convo.get("created_at", "")
messages = extract_messages(convo)
if not messages:
return chunks
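    # Slide a window of up to three messages over the conversation; the last message of each chunk carries over as overlap into the next.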
window = []
for i, (ts, role, text) in enumerate(messages):
label = "You" if role == "human" else "Claude"
window.append(f"{label}: {text}")
if len(window) >= 3 or i == len(messages) - 1:
chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
chunk_id = f"claude_{uuid}_{i}"
chunks.append((chunk_id, chunk_text, {
"source": f"Claude: {title}",
"type": "claude_conversation",
"created_at": created_at,
}))
window = window[-1:]
return chunks
def ingest_conversations(path):
print(f"\nIngesting conversations from {path.name}...")
conversations = []
# Handle both .json (array) and .jsonl (one per line)
raw = path.read_text(encoding="utf-8").strip()
if raw.startswith("["):
conversations = json.loads(raw)
else:
for line in raw.splitlines():
line = line.strip()
if line:
try:
conversations.append(json.loads(line))
                except json.JSONDecodeError:
continue
print(f"Found {len(conversations)} conversations")
total = 0
skipped = 0
for convo in conversations:
chunks = chunk_conversation(convo)
if not chunks:
skipped += 1
continue
ids = [c[0] for c in chunks]
texts = [c[1] for c in chunks]
metas = [c[2] for c in chunks]
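        # Check the ChromaDB collection for ids that were already ingested and keep only new chunks.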
existing = collection.get(ids=ids)
existing_ids = set(existing["ids"])
new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
if id not in existing_ids]
if not new:
continue
embeddings = embedder.encode([n[1] for n in new]).tolist()
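        # New chunks are written to PostgreSQL/pgvector here; the ChromaDB collection above is only consulted for dedup.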
pg = get_pg()
cur = pg.cursor()
for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
cur.execute("""
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
ON CONFLICT (id) DO UPDATE SET
document = EXCLUDED.document,
embedding = EXCLUDED.embedding,
source = EXCLUDED.source,
type = EXCLUDED.type,
created_at = EXCLUDED.created_at,
metadata = EXCLUDED.metadata
""", (
chunk_id, chunk_text, embedding,
meta.get('source'), meta.get('type'), meta.get('created_at'),
json_module.dumps(meta)
))
pg.commit()
pg.close()
total += len(new)
print(f"Conversations: {total} chunks added, {skipped} skipped")
return total
def ingest_memories(path):
print(f"\nIngesting memories from {path.name}...")
raw = json.loads(path.read_text(encoding="utf-8"))
# Memories are a list of memory objects
memories = raw if isinstance(raw, list) else raw.get("memories", [])
if not memories:
print("No memories found")
return 0
# Combine all memories into one chunk — they're already distilled
memory_text = "\n".join([
f"- {m.get('content', m) if isinstance(m, dict) else m}"
for m in memories
])
chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
chunk_id = "claude_memories_consolidated"
existing = collection.get(ids=[chunk_id])
if existing["ids"]:
# Update by deleting and re-adding
collection.delete(ids=[chunk_id])
embedding = embedder.encode([chunk_text]).tolist()
collection.upsert(
ids=[chunk_id],
documents=[chunk_text],
metadatas=[{
"source": "Claude: Memory",
"type": "claude_memory",
}],
embeddings=embedding,
)
print(f"Memories: 1 chunk added ({len(memories)} memory items)")
return 1
# Run ingestion
export_dir = Path(EXPORT_DIR)
total = 0
conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
total += ingest_conversations(f)
mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
total += ingest_memories(f)
if total == 0:
print("\nNo files found or no new chunks to add.")
else:
print(f"\nTotal chunks added to corpus: {total}")
# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
+91
@@ -0,0 +1,91 @@
"""
Aaron AI — Migration: pgvector to Graphiti
One-time migration. Test with limit first: python3 migrate_to_graphiti.py 100
"""
import os, sys, json, time, requests, psycopg2
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
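# Config: Graphiti sidecar endpoint, source Postgres DSN, and a progress file that lets an interrupted migration resume.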
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.getenv("PG_DSN")
GROUP_ID = "aaron"
BATCH_PAUSE = 0.5
PROGRESS_FILE = Path.home() / "aaronai" / "migration_progress.json"
def load_progress():
if PROGRESS_FILE.exists():
return json.loads(PROGRESS_FILE.read_text())
return {"completed_ids": [], "failed_ids": []}
def save_progress(progress):
PROGRESS_FILE.write_text(json.dumps(progress, indent=2))
def migrate(limit=None):
try:
resp = requests.get(f"{GRAPHITI_URL}/health", timeout=5)
print(f"Graphiti: {resp.json()}")
except Exception as e:
print(f"ERROR: sidecar not reachable — {e}"); sys.exit(1)
progress = load_progress()
completed_ids = set(progress["completed_ids"])
failed_ids = progress["failed_ids"]
if completed_ids:
print(f"Resuming — {len(completed_ids)} done, {len(failed_ids)} failed")
pg = psycopg2.connect(PG_DSN)
cur = pg.cursor()
query = "SELECT id, document, source, created_at FROM embeddings ORDER BY created_at ASC"
if limit:
query += f" LIMIT {limit}"
cur.execute(query)
rows = cur.fetchall()
pg.close()
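    # Skip rows that were already migrated in a previous (possibly interrupted) run.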
pending = [r for r in rows if r[0] not in completed_ids]
print(f"Total: {len(rows)} | Pending: {len(pending)}{' [TEST]' if limit else ''}\n")
success = len(completed_ids)
failed = len(failed_ids)
start = time.time()
for i, (id, document, source, created_at) in enumerate(pending):
try:
src = (source or "unknown").replace("/", "-").replace(" ", "-")[:80]
name = f"{src}-{id[:8]}"
requests.post(f"{GRAPHITI_URL}/episodes", json={
"name": name,
"content": document,
"source_description": source or "nextcloud-corpus",
"timestamp": created_at or datetime.now().isoformat(),
"group_id": GROUP_ID,
}, timeout=120).raise_for_status()
success += 1
progress["completed_ids"].append(id)
if success % 10 == 0:
save_progress(progress)
if (i + 1) % 50 == 0:
elapsed = time.time() - start
rate = (i + 1) / elapsed
remaining = (len(pending) - i - 1) / rate if rate > 0 else 0
print(f" [{i+1}/{len(pending)}] {success} ok, {failed} failed | ~{remaining/60:.0f} min left")
time.sleep(BATCH_PAUSE)
except Exception as e:
failed += 1
progress["failed_ids"].append({"id": id, "error": str(e)})
print(f" FAILED {id}: {e}")
save_progress(progress)
time.sleep(2)
save_progress(progress)
elapsed = time.time() - start
print(f"\nDone — {success} ok, {failed} failed, {elapsed/60:.1f} min")
if limit and len(pending) > 0:
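        # Extrapolate the timed test batch to the full corpus (hard-coded total row count).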
est = (elapsed / len(pending)) * 12915 / 60
print(f"Estimated full run: ~{est:.0f} min")
if __name__ == "__main__":
migrate(int(sys.argv[1]) if len(sys.argv) > 1 else None)
+125
@@ -0,0 +1,125 @@
"""
Migration: ChromaDB → pgvector
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
Keeps ChromaDB intact as backup until migration is verified.
"""
import os
import sqlite3
import psycopg2
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
CHROMA_SQLITE = str(Path.home() / "aaronai" / "db" / "chroma.sqlite3")
PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
raise RuntimeError("PG_DSN environment variable not set")
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Connecting to databases...")
chroma = sqlite3.connect(CHROMA_SQLITE)
chroma.row_factory = sqlite3.Row
c = chroma.cursor()
pg = psycopg2.connect(PG_DSN)
pg_cur = pg.cursor()
# Get all documents with their metadata from ChromaDB
print("Reading documents from ChromaDB...")
c.execute("""
SELECT
e.id as row_id,
e.embedding_id,
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document,
MAX(CASE WHEN em.key = 'source' THEN em.string_value END) as source,
MAX(CASE WHEN em.key = 'type' THEN em.string_value END) as type,
MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) as created_at
FROM embeddings e
LEFT JOIN embedding_metadata em ON e.id = em.id
GROUP BY e.id, e.embedding_id
HAVING document IS NOT NULL
ORDER BY e.id
""")
rows = c.fetchall()
print(f"Found {len(rows)} documents to migrate")
# Check existing in PostgreSQL
pg_cur.execute("SELECT id FROM embeddings")
existing_ids = set(r[0] for r in pg_cur.fetchall())
print(f"Already in PostgreSQL: {len(existing_ids)}")
# Filter to only new ones
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
print(f"Need to migrate: {len(to_migrate)}")
if not to_migrate:
print("Nothing to migrate — already complete")
chroma.close()
pg.close()
exit(0)
# Migrate in batches
batch_size = 200
migrated = 0
errors = 0
for i in range(0, len(to_migrate), batch_size):
batch = to_migrate[i:i+batch_size]
# Generate embeddings
texts = [r['document'] for r in batch]
try:
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
except Exception as e:
print(f"Embedding error at batch {i}: {e}")
errors += len(batch)
continue
# Insert into PostgreSQL
for row, embedding in zip(batch, embeddings):
try:
pg_cur.execute("""
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
ON CONFLICT (id) DO UPDATE SET
document = EXCLUDED.document,
embedding = EXCLUDED.embedding,
source = EXCLUDED.source,
type = EXCLUDED.type,
created_at = EXCLUDED.created_at,
metadata = EXCLUDED.metadata
""", (
row['embedding_id'],
row['document'],
embedding,
row['source'],
row['type'],
row['created_at'],
json.dumps({
'source': row['source'],
'type': row['type'],
'created_at': row['created_at'],
})
))
migrated += 1
except Exception as e:
print(f"Insert error for {row['embedding_id']}: {e}")
errors += 1
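    # Commit once per batch rather than per row.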
pg.commit()
print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")
# Final count
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
final_count = pg_cur.fetchone()[0]
chroma.close()
pg.close()
print(f"\nMigration complete:")
print(f" Migrated: {migrated}")
print(f" Errors: {errors}")
print(f" PostgreSQL total: {final_count}")