chore: archive deprecated chromadb and migration scripts
@@ -0,0 +1,250 @@
import os
from pathlib import Path
from datetime import datetime

import anthropic
import chromadb
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

load_dotenv(Path.home() / "aaronai" / ".env")

memory_path = Path.home() / "aaronai" / "memory.md"
db_path = str(Path.home() / "aaronai" / "db")

print("Loading Aaron AI...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=db_path)
collection = chroma_client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
called FWN3D. He has a background in graffiti lettering and vector illustration.

You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
Use this context to give answers grounded in his actual work and history. When helping him write
or create, match his voice and draw on his existing materials. Be direct and specific -
Aaron values precision over padding. Always cite which documents you drew from when relevant.

You have access to web search. Use it automatically when:
- Questions require current data (salaries, job postings, prices, news)
- Questions reference specific institutions, people, or organizations you need to verify
- Aaron's documents and memory don't contain sufficient information to answer well
Do not announce that you are searching. Just search and incorporate results naturally."""

CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
conversation_history = []

TOOLS = [
    {
        "type": "web_search_20250305",
        "name": "web_search"
    }
]


def load_memory():
    if memory_path.exists():
        return memory_path.read_text(encoding="utf-8")
    return ""


def save_memory(content):
    memory_path.write_text(content, encoding="utf-8")


def add_to_memory(new_item):
    memory = load_memory()
    timestamp = datetime.now().strftime("%Y-%m-%d")
    note = f"\n- [{timestamp}] {new_item}"
    if "## Notes" not in memory:
        memory += "\n\n## Notes"
    memory += note
    save_memory(memory)


def remove_from_memory(item):
    memory = load_memory()
    lines = memory.split("\n")
    filtered = [line for line in lines if item.lower() not in line.lower()]
    save_memory("\n".join(filtered))
    return len(lines) - len(filtered)


def get_pinned_cv_context():
    results = collection.get(
        where={"source": "Aaron Nelson CV 2024.pdf"},
        include=["documents", "metadatas"]
    )
    return results["documents"], results["metadatas"]


def is_professional_query(query):
    keywords = [
        "grant", "publication", "exhibition", "award", "fellowship",
        "experience", "position", "job", "career", "cv", "resume",
        "research", "work history", "accomplishment", "teaching",
        "course", "client", "consultation", "presentation", "workshop",
        "education", "degree", "institution", "service", "committee"
    ]
    return any(keyword in query.lower() for keyword in keywords)


def retrieve_context(query, n_results=8):
    query_embedding = embedder.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )

    context_pieces = []
    sources = []

    if is_professional_query(query):
        cv_docs, cv_metas = get_pinned_cv_context()
        for doc, meta in zip(cv_docs, cv_metas):
            context_pieces.append(f"[CV] {doc}")
            sources.append(meta["source"])

    for doc, meta, dist in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0]
    ):
        relevance = 1 - dist
        if relevance > 0.3 and meta["source"] not in CV_SOURCES:
            context_pieces.append(doc)
            sources.append(meta["source"])

    return context_pieces, sources


def handle_command(user_input):
    stripped = user_input.strip().lower()

    if stripped == "show memory":
        memory = load_memory()
        print(f"\nAaron AI: Current memory:\n\n{memory}")
        return True

    if stripped.startswith("remember:"):
        item = user_input[9:].strip()
        add_to_memory(item)
        print(f"\nAaron AI: Saved to memory: '{item}'")
        return True

    if stripped.startswith("forget:"):
        item = user_input[7:].strip()
        removed = remove_from_memory(item)
        if removed:
            print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
        else:
            print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
        return True

    if stripped == "clear":
        conversation_history.clear()
        print("\nAaron AI: Conversation history cleared.")
        return True

    return False


def chat(user_message):
    memory = load_memory()
    context_pieces, sources = retrieve_context(user_message)

    context_parts = []
    if memory:
        context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
    if context_pieces:
        context_str = "\n\n---\n\n".join(context_pieces)
        unique_sources = list(set(sources))
        context_parts.append(
            f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
        )

    context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
    full_message = context_block + user_message

    # Build messages for this turn
    messages = conversation_history + [{"role": "user", "content": full_message}]

    # Agentic loop to handle tool use
    while True:
        response = anthropic_client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            tools=TOOLS,
            messages=messages
        )

        # Check if we need to handle tool calls
        if response.stop_reason == "tool_use":
            # Add assistant response to messages
            messages.append({"role": "assistant", "content": response.content})

            # Process each tool use block
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": "Search completed"
                    })

            # Add tool results and continue
            messages.append({"role": "user", "content": tool_results})

        else:
            # Final response - extract text
            assistant_message = ""
            for block in response.content:
                if hasattr(block, "text"):
                    assistant_message += block.text

            # Update conversation history with clean versions
            conversation_history.append({"role": "user", "content": full_message})
            conversation_history.append({"role": "assistant", "content": assistant_message})

            if len(conversation_history) > 20:
                conversation_history.pop(0)
                conversation_history.pop(0)

            return assistant_message, sources


def main():
    print("Aaron AI ready. Corpus, memory, and web search loaded.")
    print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
    print("=" * 60)

    while True:
        try:
            user_input = input("\nYou: ").strip()

            if not user_input:
                continue

            if user_input.lower() == "quit":
                print("Goodbye.")
                break

            if handle_command(user_input):
                continue

            response, sources = chat(user_input)
            print(f"\nAaron AI: {response}")

            if sources:
                unique = list(set(sources))
                print(f"\n[Sources: {', '.join(unique)}]")

        except KeyboardInterrupt:
            print("\nGoodbye.")
            break
        except Exception as e:
            print(f"Error: {e}")


if __name__ == "__main__":
    main()
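For reference, add_to_memory and remove_from_memory above maintain memory.md as plain markdown: a "## Notes" header followed by dated bullets. After a few "remember:" commands the tail of the file would look something like this (the entries are invented for illustration):

## Notes
- [2025-01-12] Prefers metric units in fabrication specs
- [2025-01-19] FWN3D invoices go out on the first of the month

Because "forget:" filters by case-insensitive substring match over whole lines, a single forget can remove several bullets at once.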
@@ -0,0 +1,152 @@
import json
from pathlib import Path
from datetime import datetime

import chromadb  # was missing in the archived script; needed for PersistentClient below
from sentence_transformers import SentenceTransformer

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)


def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation."""
    mapping = convo.get("mapping", {})
    messages = []

    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            continue

        role = msg.get("author", {}).get("role")
        if role not in ["user", "assistant"]:
            continue

        content = msg.get("content", {})
        parts = content.get("parts", [])

        # Extract text parts only
        text = ""
        for part in parts:
            if isinstance(part, str):
                text += part
            elif isinstance(part, dict) and part.get("content_type") == "text":
                text += part.get("text", "")

        text = text.strip()
        if not text:
            continue

        create_time = msg.get("create_time") or 0
        messages.append((create_time, role, text))

    # Sort by timestamp
    messages.sort(key=lambda x: x[0])
    return messages


def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping text chunks."""
    # Build full conversation text
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")

    full_text = "\n".join(lines)

    # Split into word-level chunks with overlap
    words = full_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap

    return chunks


def ingest_file(json_path):
    print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except Exception:
            date_str = "unknown"

        messages = extract_messages(convo)

        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f"  [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks


def main():
    export_dir = Path(EXPORT_DIR)
    files = [
        export_dir / "conversations-000.json",
        export_dir / "conversations-001.json"
    ]

    grand_total = 0
    for f in files:
        if f.exists():
            grand_total += ingest_file(f)
        else:
            print(f"Not found: {f}")

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")


if __name__ == "__main__":
    main()
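extract_messages above walks the ChatGPT export's "mapping" graph. Each node it accepts has roughly this shape; the field names are exactly the ones the code reads, but the values here are invented:

node = {
    "message": {
        "author": {"role": "user"},       # or "assistant"; other roles are skipped
        "create_time": 1700000000.0,      # unix timestamp; may be missing/null
        "content": {
            "content_type": "text",
            # parts may mix plain strings and {"content_type": "text", "text": ...} dicts
            "parts": ["How do I tune PolyJet supports?"],
        },
    }
}

Since mapping values arrive in arbitrary order, the explicit sort on create_time at the end of the function is what restores chronology.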
@@ -0,0 +1,189 @@
import json
import os
from pathlib import Path

import chromadb  # was missing in the archived script; needed for PersistentClient below
import psycopg2
from sentence_transformers import SentenceTransformer

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)


def get_pg():
    # The archived script called get_pg() without defining it. This minimal
    # sketch assumes the same PG_DSN environment variable the other
    # migration scripts use.
    return psycopg2.connect(os.getenv("PG_DSN"))


def extract_messages(convo):
    messages = []
    for msg in convo.get("chat_messages", []):
        role = msg.get("sender", "")
        if role not in ["human", "assistant"]:
            continue
        content = msg.get("content", [])
        text = ""
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    text += block.get("text", "")
                elif isinstance(block, str):
                    text += block
        text = text.strip()
        if not text:
            continue
        messages.append((msg.get("created_at", ""), role, text))
    return messages


def chunk_conversation(convo):
    chunks = []
    title = convo.get("name", "Untitled conversation")
    uuid = convo.get("uuid", "")
    created_at = convo.get("created_at", "")
    messages = extract_messages(convo)
    if not messages:
        return chunks

    # Slide a three-message window over the conversation, carrying the last
    # message forward as overlap between consecutive chunks
    window = []
    for i, (ts, role, text) in enumerate(messages):
        label = "You" if role == "human" else "Claude"
        window.append(f"{label}: {text}")
        if len(window) >= 3 or i == len(messages) - 1:
            chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
            chunk_id = f"claude_{uuid}_{i}"
            chunks.append((chunk_id, chunk_text, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            window = window[-1:]
    return chunks


def ingest_conversations(path):
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []

    # Handle both .json (array) and .jsonl (one per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if line:
                try:
                    conversations.append(json.loads(line))
                except json.JSONDecodeError:
                    continue

    print(f"Found {len(conversations)} conversations")
    total = 0
    skipped = 0

    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue

        ids = [c[0] for c in chunks]
        texts = [c[1] for c in chunks]
        metas = [c[2] for c in chunks]

        # Skip chunks already present in the Chroma collection
        existing = collection.get(ids=ids)
        existing_ids = set(existing["ids"])
        new = [(chunk_id, txt, meta) for chunk_id, txt, meta in zip(ids, texts, metas)
               if chunk_id not in existing_ids]

        if not new:
            continue

        # New chunks are embedded and written to the pgvector table
        embeddings = embedder.encode([n[1] for n in new]).tolist()
        pg = get_pg()
        cur = pg.cursor()
        for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
            cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    type = EXCLUDED.type,
                    created_at = EXCLUDED.created_at,
                    metadata = EXCLUDED.metadata
            """, (
                chunk_id, chunk_text, embedding,
                meta.get('source'), meta.get('type'), meta.get('created_at'),
                json.dumps(meta)
            ))
        pg.commit()
        pg.close()
        total += len(new)

    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total


def ingest_memories(path):
    print(f"\nIngesting memories from {path.name}...")
    raw = json.loads(path.read_text(encoding="utf-8"))

    # Memories are a list of memory objects
    memories = raw if isinstance(raw, list) else raw.get("memories", [])
    if not memories:
        print("No memories found")
        return 0

    # Combine all memories into one chunk — they're already distilled
    memory_text = "\n".join([
        f"- {m.get('content', m) if isinstance(m, dict) else m}"
        for m in memories
    ])

    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"

    existing = collection.get(ids=[chunk_id])
    if existing["ids"]:
        # Update by deleting and re-adding
        collection.delete(ids=[chunk_id])

    embedding = embedder.encode([chunk_text]).tolist()
    collection.upsert(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[{
            "source": "Claude: Memory",
            "type": "claude_memory",
        }],
        embeddings=embedding,
    )

    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
    return 1


# Run ingestion
export_dir = Path(EXPORT_DIR)
total = 0

conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
    total += ingest_conversations(f)

mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
    total += ingest_memories(f)

if total == 0:
    print("\nNo files found or no new chunks to add.")
else:
    print(f"\nTotal chunks added to corpus: {total}")

# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
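For comparison with the ChatGPT export, the Claude export messages that this script's extract_messages reads have roughly this shape; the keys are the ones the code touches, the values are invented:

message = {
    "sender": "human",                       # or "assistant"; anything else is skipped
    "created_at": "2024-08-01T12:34:56Z",
    # content may be a plain string, or a list mixing strings and text blocks
    "content": [
        {"type": "text", "text": "Draft a syllabus blurb for the HVAMC course."},
    ],
}

Note the split destination in this script: conversation chunks go to the pgvector table, while the consolidated memory chunk still goes to the Chroma collection, which is presumably part of why it was archived mid-migration.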
@@ -0,0 +1,91 @@
"""
Aaron AI — Migration: pgvector to Graphiti
One-time migration. Test with limit first: python3 migrate_to_graphiti.py 100
"""
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path

import psycopg2
import requests
from dotenv import load_dotenv

load_dotenv(Path.home() / "aaronai" / ".env")

GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.getenv("PG_DSN")
GROUP_ID = "aaron"
BATCH_PAUSE = 0.5
PROGRESS_FILE = Path.home() / "aaronai" / "migration_progress.json"


def load_progress():
    if PROGRESS_FILE.exists():
        return json.loads(PROGRESS_FILE.read_text())
    return {"completed_ids": [], "failed_ids": []}


def save_progress(progress):
    PROGRESS_FILE.write_text(json.dumps(progress, indent=2))


def migrate(limit=None):
    try:
        resp = requests.get(f"{GRAPHITI_URL}/health", timeout=5)
        print(f"Graphiti: {resp.json()}")
    except Exception as e:
        print(f"ERROR: sidecar not reachable — {e}")
        sys.exit(1)

    progress = load_progress()
    completed_ids = set(progress["completed_ids"])
    failed_ids = progress["failed_ids"]
    if completed_ids:
        print(f"Resuming — {len(completed_ids)} done, {len(failed_ids)} failed")

    pg = psycopg2.connect(PG_DSN)
    cur = pg.cursor()
    query = "SELECT id, document, source, created_at FROM embeddings ORDER BY created_at ASC"
    if limit:
        query += f" LIMIT {limit}"
    cur.execute(query)
    rows = cur.fetchall()
    pg.close()

    pending = [r for r in rows if r[0] not in completed_ids]
    print(f"Total: {len(rows)} | Pending: {len(pending)}{' [TEST]' if limit else ''}\n")

    success = len(completed_ids)
    failed = len(failed_ids)
    start = time.time()

    for i, (row_id, document, source, created_at) in enumerate(pending):
        try:
            src = (source or "unknown").replace("/", "-").replace(" ", "-")[:80]
            name = f"{src}-{row_id[:8]}"
            requests.post(f"{GRAPHITI_URL}/episodes", json={
                "name": name,
                "content": document,
                "source_description": source or "nextcloud-corpus",
                "timestamp": created_at or datetime.now().isoformat(),
                "group_id": GROUP_ID,
            }, timeout=120).raise_for_status()
            success += 1
            progress["completed_ids"].append(row_id)
            if success % 10 == 0:
                save_progress(progress)
            if (i + 1) % 50 == 0:
                elapsed = time.time() - start
                rate = (i + 1) / elapsed
                remaining = (len(pending) - i - 1) / rate if rate > 0 else 0
                print(f"  [{i+1}/{len(pending)}] {success} ok, {failed} failed | ~{remaining/60:.0f} min left")
            time.sleep(BATCH_PAUSE)
        except Exception as e:
            failed += 1
            progress["failed_ids"].append({"id": row_id, "error": str(e)})
            print(f"  FAILED {row_id}: {e}")
            save_progress(progress)
            time.sleep(2)

    save_progress(progress)
    elapsed = time.time() - start
    print(f"\nDone — {success} ok, {failed} failed, {elapsed/60:.1f} min")
    if limit and len(pending) > 0:
        # 12915 is the hardcoded full-corpus row count, used only to
        # extrapolate a full-run time from a limited test run
        est = (elapsed / len(pending)) * 12915 / 60
        print(f"Estimated full run: ~{est:.0f} min")


if __name__ == "__main__":
    migrate(int(sys.argv[1]) if len(sys.argv) > 1 else None)
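The resumable loop above checkpoints its state to migration_progress.json. Based on load_progress and save_progress, the file has this shape (the ids and error text are invented):

{
  "completed_ids": ["a1b2c3d4-0001", "a1b2c3d4-0002"],
  "failed_ids": [
    {"id": "a1b2c3d4-0003", "error": "HTTPError: 500 Server Error"}
  ]
}

Because progress is saved every ten successes and on every failure, a crash loses at most the last few completed episodes, and rerunning the script skips everything in completed_ids.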
@@ -0,0 +1,125 @@
"""
Migration: ChromaDB → pgvector
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
Keeps ChromaDB intact as backup until migration is verified.
"""
import json
import os
import sqlite3
from pathlib import Path

import psycopg2
from sentence_transformers import SentenceTransformer

CHROMA_SQLITE = str(Path.home() / "aaronai" / "db" / "chroma.sqlite3")
PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
    raise RuntimeError("PG_DSN environment variable not set")

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

print("Connecting to databases...")
chroma = sqlite3.connect(CHROMA_SQLITE)
chroma.row_factory = sqlite3.Row
c = chroma.cursor()

pg = psycopg2.connect(PG_DSN)
pg_cur = pg.cursor()

# Get all documents with their metadata from ChromaDB
print("Reading documents from ChromaDB...")
c.execute("""
    SELECT
        e.id AS row_id,
        e.embedding_id,
        MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) AS document,
        MAX(CASE WHEN em.key = 'source' THEN em.string_value END) AS source,
        MAX(CASE WHEN em.key = 'type' THEN em.string_value END) AS type,
        MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) AS created_at
    FROM embeddings e
    LEFT JOIN embedding_metadata em ON e.id = em.id
    GROUP BY e.id, e.embedding_id
    HAVING document IS NOT NULL
    ORDER BY e.id
""")

rows = c.fetchall()
print(f"Found {len(rows)} documents to migrate")

# Check existing in PostgreSQL
pg_cur.execute("SELECT id FROM embeddings")
existing_ids = set(r[0] for r in pg_cur.fetchall())
print(f"Already in PostgreSQL: {len(existing_ids)}")

# Filter to only new ones
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
print(f"Need to migrate: {len(to_migrate)}")

if not to_migrate:
    print("Nothing to migrate — already complete")
    chroma.close()
    pg.close()
    raise SystemExit(0)

# Migrate in batches
batch_size = 200
migrated = 0
errors = 0

for i in range(0, len(to_migrate), batch_size):
    batch = to_migrate[i:i+batch_size]

    # Generate embeddings
    texts = [r['document'] for r in batch]
    try:
        embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
    except Exception as e:
        print(f"Embedding error at batch {i}: {e}")
        errors += len(batch)
        continue

    # Insert into PostgreSQL
    for row, embedding in zip(batch, embeddings):
        try:
            pg_cur.execute("""
                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
                ON CONFLICT (id) DO UPDATE SET
                    document = EXCLUDED.document,
                    embedding = EXCLUDED.embedding,
                    source = EXCLUDED.source,
                    type = EXCLUDED.type,
                    created_at = EXCLUDED.created_at,
                    metadata = EXCLUDED.metadata
            """, (
                row['embedding_id'],
                row['document'],
                embedding,
                row['source'],
                row['type'],
                row['created_at'],
                json.dumps({
                    'source': row['source'],
                    'type': row['type'],
                    'created_at': row['created_at'],
                })
            ))
            migrated += 1
        except Exception as e:
            print(f"Insert error for {row['embedding_id']}: {e}")
            errors += 1

    pg.commit()
    print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")

# Final count
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
final_count = pg_cur.fetchone()[0]

chroma.close()
pg.close()

print("\nMigration complete:")
print(f"  Migrated: {migrated}")
print(f"  Errors: {errors}")
print(f"  PostgreSQL total: {final_count}")
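None of these scripts create the pgvector table they write to. The following is a plausible schema inferred from the INSERT statements above and from the fact that all-MiniLM-L6-v2 produces 384-dimensional embeddings; it is not taken from the repo, so treat it as a sketch rather than the actual DDL:

import os
import psycopg2

# Sketch of the table the migration scripts assume; column types are
# inferred from the INSERTs, not copied from the real schema.
DDL = """
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE IF NOT EXISTS embeddings (
    id         TEXT PRIMARY KEY,
    document   TEXT,
    embedding  VECTOR(384),  -- all-MiniLM-L6-v2 output dimension
    source     TEXT,
    type       TEXT,
    created_at TEXT,
    metadata   JSONB
);
"""

pg = psycopg2.connect(os.getenv("PG_DSN"))
with pg, pg.cursor() as cur:
    cur.execute(DDL)
pg.close()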