chore: archive deprecated chromadb and migration scripts
This commit is contained in:
@@ -0,0 +1,189 @@
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import json as json_module
|
||||
|
||||
# Filesystem locations: the local Chroma store and the Claude export folder.
db_path = str(Path.home().joinpath("aaronai", "db"))
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# NOTE(review): `chromadb` is not among the imports visible in this file;
# confirm it is imported, otherwise the next line raises NameError.
client = chromadb.PersistentClient(path=db_path)

# Metadata passed when the "aaronai" collection is fetched or created.
_collection_settings = {"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
collection = client.get_or_create_collection(
    name="aaronai",
    metadata=_collection_settings,
)
|
||||
|
||||
def extract_messages(convo):
    """Collect the non-empty human/assistant messages of one conversation.

    Args:
        convo: Conversation dict. Messages are read from its
            "chat_messages" entry; each message is a dict whose "sender",
            "content" and "created_at" keys are consulted.

    Returns:
        A list of ``(created_at, role, text)`` tuples in the original
        message order. Messages from other senders, and messages whose text
        is empty after stripping whitespace, are left out. ``created_at``
        is ``""`` when the message has no such key.
    """
    messages = []
    # `or []` also covers an explicit null "chat_messages" value.
    for msg in convo.get("chat_messages") or []:
        role = msg.get("sender", "")
        if role not in ("human", "assistant"):
            continue

        content = msg.get("content", [])
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            # Content may be a list of blocks: keep dict blocks typed "text"
            # and bare strings, ignore everything else.
            parts = []
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    # A null "text" value contributes nothing instead of
                    # raising TypeError.
                    parts.append(block.get("text") or "")
                elif isinstance(block, str):
                    parts.append(block)
            text = "".join(parts)
        else:
            # Any other content type (including None) carries no text.
            text = ""

        text = text.strip()
        if not text:
            continue
        messages.append((msg.get("created_at", ""), role, text))
    return messages
|
||||
|
||||
def chunk_conversation(convo):
    """Split one conversation into small overlapping text chunks.

    Messages are accumulated into a window; a chunk is emitted whenever the
    window holds three entries or the last message is reached. After each
    emission the final entry is carried over, so consecutive chunks overlap
    by one message.

    Args:
        convo: Conversation dict; its "name", "uuid" and "created_at" keys
            are read, and its messages are obtained via extract_messages().

    Returns:
        A list of ``(chunk_id, chunk_text, metadata)`` tuples, empty when
        the conversation has no usable messages. ``chunk_id`` is
        ``claude_<uuid>_<index of the last message in the chunk>``.
    """
    chunks = []
    # `or` (rather than a .get default) also covers a name that is present
    # but empty or null, which would otherwise produce a blank title.
    title = convo.get("name") or "Untitled conversation"
    convo_id = convo.get("uuid", "")
    created_at = convo.get("created_at", "")

    messages = extract_messages(convo)
    if not messages:
        return chunks

    last_index = len(messages) - 1
    window = []
    for i, (_created, role, text) in enumerate(messages):
        label = "You" if role == "human" else "Claude"
        window.append(f"{label}: {text}")

        if len(window) >= 3 or i == last_index:
            chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
            chunk_id = f"claude_{convo_id}_{i}"
            chunks.append((chunk_id, chunk_text, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            # Keep the last entry as context for the next chunk.
            window = window[-1:]
    return chunks
|
||||
|
||||
def ingest_conversations(path):
    """Chunk, embed and store every conversation found in an export file.

    Args:
        path: ``pathlib.Path`` of the export. Content starting with ``[`` is
            parsed as one JSON array; anything else is treated as JSON
            Lines, with unparsable lines skipped.

    Returns:
        The number of chunks written.
    """
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []

    # Handle both .json (array) and .jsonl (one per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                conversations.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort: skip malformed lines, but no longer swallow
                # unrelated errors such as KeyboardInterrupt.
                continue

    print(f"Found {len(conversations)} conversations")

    insert_sql = """
        INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
        VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
        ON CONFLICT (id) DO UPDATE SET
            document = EXCLUDED.document,
            embedding = EXCLUDED.embedding,
            source = EXCLUDED.source,
            type = EXCLUDED.type,
            created_at = EXCLUDED.created_at,
            metadata = EXCLUDED.metadata
    """

    total = 0
    skipped = 0

    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue

        # NOTE(review): already-ingested ids are looked up in the Chroma
        # collection while new rows are written to Postgres below; confirm
        # this split is intended.
        existing = collection.get(ids=[chunk[0] for chunk in chunks])
        existing_ids = set(existing["ids"])
        new = [chunk for chunk in chunks if chunk[0] not in existing_ids]
        if not new:
            continue

        embeddings = embedder.encode([chunk[1] for chunk in new]).tolist()

        # NOTE(review): get_pg is not defined in the visible part of this
        # file; presumably it returns an open psycopg2 connection.
        pg = get_pg()
        try:
            cur = pg.cursor()
            for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
                cur.execute(insert_sql, (
                    chunk_id, chunk_text, embedding,
                    meta.get('source'), meta.get('type'), meta.get('created_at'),
                    json.dumps(meta),
                ))
            pg.commit()
        finally:
            # Close the connection even when an insert fails.
            pg.close()
        total += len(new)

    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total
|
||||
|
||||
def ingest_memories(path):
    """Store all memories from an export file as one consolidated chunk.

    Args:
        path: ``pathlib.Path`` of a JSON file holding either a list of
            memories or an object with a "memories" list.

    Returns:
        1 when the consolidated chunk was written, 0 when the file holds no
        memories.
    """
    print(f"\nIngesting memories from {path.name}...")
    raw = json.loads(path.read_text(encoding="utf-8"))

    # Memories are a list of memory objects, possibly under a "memories" key.
    if isinstance(raw, list):
        memories = raw
    else:
        memories = raw.get("memories", [])
    if not memories:
        print("No memories found")
        return 0

    # One bullet per memory; they are already distilled, so all of them go
    # into a single chunk. Dict items contribute their "content" value,
    # or the whole dict when that key is absent.
    bullet_lines = []
    for item in memories:
        body = item.get('content', item) if isinstance(item, dict) else item
        bullet_lines.append(f"- {body}")
    memory_text = "\n".join(bullet_lines)

    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"

    # Remove any earlier copy of the chunk before writing it again.
    if collection.get(ids=[chunk_id])["ids"]:
        collection.delete(ids=[chunk_id])

    vectors = embedder.encode([chunk_text]).tolist()
    chunk_meta = {
        "source": "Claude: Memory",
        "type": "claude_memory",
    }
    collection.upsert(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[chunk_meta],
        embeddings=vectors,
    )

    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
    return 1
|
||||
|
||||
# Run ingestion: conversations files first, then memories files.
export_dir = Path(EXPORT_DIR)
total = 0

for pattern, ingest in (
    ("conversations.*", ingest_conversations),
    ("memories.*", ingest_memories),
):
    for export_file in export_dir.glob(pattern):
        total += ingest(export_file)

if total != 0:
    print(f"\nTotal chunks added to corpus: {total}")
else:
    print("\nNo files found or no new chunks to add.")

# Show updated corpus size as reported by the Chroma collection.
print(f"Corpus now contains {collection.count()} total chunks")
|
||||
Reference in New Issue
Block a user