chore: archive deprecated chromadb and migration scripts

2026-04-28 00:15:46 +00:00
parent d5b5c2ec14
commit 037d747573
10 changed files with 486 additions and 11 deletions
@@ -0,0 +1,189 @@
+import json
+import sys
+from pathlib import Path
+from sentence_transformers import SentenceTransformer
+import psycopg2
+import psycopg2.extras
+import json as json_module
+
+# Paths
+# db_path is the on-disk store directory (~/aaronai/db) handed to
+# chromadb.PersistentClient below.
+db_path = str(Path.home() / "aaronai" / "db")
+# Directory that the driver code at the bottom of this script globs for
+# conversations.* and memories.* export files.
+EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"
+
+# Module-level setup: the model and the collection are created as soon as the
+# file is executed, before any ingest_* function is called.
+print("Loading embedding model...")
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+# NOTE(review): `chromadb` is used here but does not appear among the imports
+# at the top of this file, so this line would raise NameError as written.
+# Confirm whether the import was dropped by mistake during the migration.
+client = chromadb.PersistentClient(path=db_path)
+# The collection is created on first use and configured for cosine distance.
+collection = client.get_or_create_collection(
+    name="aaronai",
+    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
+)
+
+def extract_messages(convo):
+    """Return the non-empty human/assistant messages of one conversation.
+
+    Args:
+        convo: Conversation dict; its "chat_messages" list is read. A missing
+            or null list yields no messages.
+
+    Returns:
+        A list of (created_at, sender, text) tuples in their original order.
+        sender is "human" or "assistant"; text is stripped and never empty.
+        Message content may be a plain string or a list of blocks, of which
+        only {"type": "text"} dicts and bare strings contribute text.
+    """
+    messages = []
+    # `or []` also covers an explicit null, which .get()'s default does not.
+    for msg in convo.get("chat_messages") or []:
+        if not isinstance(msg, dict):
+            continue
+        role = msg.get("sender", "")
+        if role not in ("human", "assistant"):
+            continue
+        content = msg.get("content", [])
+        if isinstance(content, str):
+            text = content
+        elif isinstance(content, list):
+            parts = []
+            for block in content:
+                if isinstance(block, dict) and block.get("type") == "text":
+                    # A text block may hold "text": null; treat it as empty
+                    # instead of failing on str + None.
+                    parts.append(block.get("text") or "")
+                elif isinstance(block, str):
+                    parts.append(block)
+            text = "".join(parts)
+        else:
+            text = ""
+        text = text.strip()
+        if not text:
+            continue
+        messages.append((msg.get("created_at", ""), role, text))
+    return messages
+
+def chunk_conversation(convo):
+    """Split one conversation into overlapping chunks of up to three messages.
+
+    Messages are accumulated into a window; a chunk is emitted whenever the
+    window holds three entries or the final message is reached. The last
+    entry of each emitted window is carried into the next one, so adjacent
+    chunks overlap by one message.
+
+    Args:
+        convo: Conversation dict; "name", "uuid", "created_at" and the
+            messages returned by extract_messages(convo) are used.
+
+    Returns:
+        A list of (chunk_id, chunk_text, metadata) tuples, empty when the
+        conversation has no usable messages. chunk_id is
+        "claude_<uuid>_<index of the last message in the chunk>".
+    """
+    # A present-but-empty or null "name" bypasses .get()'s default, which
+    # would leave the title blank (or the literal "None"); fall back for
+    # those as well.
+    title = convo.get("name") or "Untitled conversation"
+    convo_uuid = convo.get("uuid", "")
+    created_at = convo.get("created_at", "")
+    messages = extract_messages(convo)
+
+    chunks = []
+    if not messages:
+        return chunks
+
+    last_index = len(messages) - 1
+    window = []
+    for i, (_created, role, text) in enumerate(messages):
+        label = "You" if role == "human" else "Claude"
+        window.append(f"{label}: {text}")
+        if len(window) >= 3 or i == last_index:
+            chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
+            chunk_id = f"claude_{convo_uuid}_{i}"
+            chunks.append((chunk_id, chunk_text, {
+                "source": f"Claude: {title}",
+                "type": "claude_conversation",
+                "created_at": created_at,
+            }))
+            # Keep the newest entry so the next chunk overlaps this one.
+            window = window[-1:]
+    return chunks
+
+def ingest_conversations(path):
+    """Embed the conversations of one export file and upsert them into Postgres.
+
+    Args:
+        path: pathlib.Path of a conversations export, either a JSON array or
+            JSON Lines (one conversation object per line).
+
+    Returns:
+        The number of chunks written to the `embeddings` table.
+
+    Raises:
+        json.JSONDecodeError: if an array-style file is not valid JSON.
+            Malformed individual lines of a JSON Lines file are skipped.
+    """
+    upsert_sql = """
+        INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
+        VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
+        ON CONFLICT (id) DO UPDATE SET
+            document = EXCLUDED.document,
+            embedding = EXCLUDED.embedding,
+            source = EXCLUDED.source,
+            type = EXCLUDED.type,
+            created_at = EXCLUDED.created_at,
+            metadata = EXCLUDED.metadata
+    """
+
+    print(f"\nIngesting conversations from {path.name}...")
+    conversations = []
+
+    # Handle both .json (array) and .jsonl (one per line)
+    raw = path.read_text(encoding="utf-8").strip()
+    if raw.startswith("["):
+        conversations = json.loads(raw)
+    else:
+        for line in raw.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                conversations.append(json.loads(line))
+            except json.JSONDecodeError:
+                # Best effort: skip malformed lines, without also swallowing
+                # KeyboardInterrupt/SystemExit as a bare `except:` would.
+                continue
+
+    print(f"Found {len(conversations)} conversations")
+    total = 0
+    skipped = 0
+
+    for convo in conversations:
+        chunks = chunk_conversation(convo)
+        if not chunks:
+            skipped += 1
+            continue
+
+        # NOTE(review): the "already ingested" check queries the ChromaDB
+        # collection, while the rows below are written to Postgres. Confirm
+        # which store should drive de-duplication.
+        existing = collection.get(ids=[chunk[0] for chunk in chunks])
+        existing_ids = set(existing["ids"])
+        new = [chunk for chunk in chunks if chunk[0] not in existing_ids]
+
+        if not new:
+            continue
+
+        embeddings = embedder.encode([chunk[1] for chunk in new]).tolist()
+        # NOTE(review): get_pg is neither defined nor imported in this file;
+        # presumably it returns an open psycopg2 connection. Confirm.
+        pg = get_pg()
+        try:
+            cur = pg.cursor()
+            for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
+                cur.execute(upsert_sql, (
+                    chunk_id, chunk_text, embedding,
+                    meta.get('source'), meta.get('type'), meta.get('created_at'),
+                    json_module.dumps(meta)
+                ))
+            pg.commit()
+        finally:
+            # Always release the connection; closing without a commit
+            # discards a batch that failed part-way through.
+            pg.close()
+        total += len(new)
+
+    print(f"Conversations: {total} chunks added, {skipped} skipped")
+    return total
+
+def ingest_memories(path):
+    """Store every memory item of one export file as a single combined chunk.
+
+    The file is parsed as JSON and may be either a list of memory items or an
+    object holding that list under "memories". Each item becomes one bullet
+    line (a dict contributes its "content" value, or itself when that key is
+    absent); the bullets are embedded together and written to the collection
+    under the fixed id "claude_memories_consolidated", replacing any earlier
+    entry with that id.
+
+    Args:
+        path: pathlib.Path of the memories export file.
+
+    Returns:
+        1 when a chunk was written, 0 when the file holds no memories.
+    """
+    print(f"\nIngesting memories from {path.name}...")
+    payload = json.loads(path.read_text(encoding="utf-8"))
+
+    # Accept a bare list, or an object wrapping the list under "memories".
+    if isinstance(payload, list):
+        memories = payload
+    else:
+        memories = payload.get("memories", [])
+    if not memories:
+        print("No memories found")
+        return 0
+
+    # The items are already distilled, so they all go into one chunk.
+    bullet_lines = []
+    for item in memories:
+        body = item.get('content', item) if isinstance(item, dict) else item
+        bullet_lines.append(f"- {body}")
+    memory_text = "\n".join(bullet_lines)
+
+    chunk_id = "claude_memories_consolidated"
+    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
+
+    # Drop any previous consolidated entry before writing the new one.
+    previous = collection.get(ids=[chunk_id])
+    if previous["ids"]:
+        collection.delete(ids=[chunk_id])
+
+    vectors = embedder.encode([chunk_text]).tolist()
+    chunk_meta = {
+        "source": "Claude: Memory",
+        "type": "claude_memory",
+    }
+    collection.upsert(
+        ids=[chunk_id],
+        documents=[chunk_text],
+        metadatas=[chunk_meta],
+        embeddings=vectors,
+    )
+
+    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
+    return 1
+
+# Run ingestion
+# Module-level driver: runs whenever the file is executed, after the model
+# and collection above have been created.
+export_dir = Path(EXPORT_DIR)
+total = 0
+
+# Every file named conversations.<anything> in the export directory is
+# ingested; each call returns the number of chunks it wrote.
+conv_files = list(export_dir.glob("conversations.*"))
+for f in conv_files:
+    total += ingest_conversations(f)
+
+# Likewise for memories.<anything>; each file contributes 0 or 1 to the total.
+mem_files = list(export_dir.glob("memories.*"))
+for f in mem_files:
+    total += ingest_memories(f)
+
+# A zero total means either no matching files or nothing new was written.
+if total == 0:
+    print("\nNo files found or no new chunks to add.")
+else:
+    print(f"\nTotal chunks added to corpus: {total}")
+
+# Show updated corpus size
+# NOTE(review): this counts the ChromaDB collection, whereas
+# ingest_conversations inserts into the Postgres `embeddings` table. Confirm
+# which store this figure is meant to describe.
+count = collection.count()
+print(f"Corpus now contains {count} total chunks")