import json
from pathlib import Path

import chromadb
from sentence_transformers import SentenceTransformer

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"},
)


def extract_messages(convo):
    """Pull (timestamp, role, text) tuples out of one exported conversation."""
    messages = []
    for msg in convo.get("chat_messages", []):
        role = msg.get("sender", "")
        if role not in ("human", "assistant"):
            continue
        # Content may be a plain string or a list of content blocks.
        content = msg.get("content", [])
        text = ""
        if isinstance(content, str):
            text = content
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    text += block.get("text", "")
                elif isinstance(block, str):
                    text += block
        text = text.strip()
        if not text:
            continue
        messages.append((msg.get("created_at", ""), role, text))
    return messages


def chunk_conversation(convo):
    """Split a conversation into chunks of up to three messages,
    overlapping by one message so context carries across chunks."""
    chunks = []
    title = convo.get("name", "Untitled conversation")
    uuid = convo.get("uuid", "")
    created_at = convo.get("created_at", "")
    messages = extract_messages(convo)
    if not messages:
        return chunks
    window = []
    for i, (_ts, role, text) in enumerate(messages):
        label = "You" if role == "human" else "Claude"
        window.append(f"{label}: {text}")
        if len(window) >= 3 or i == len(messages) - 1:
            chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
            chunk_id = f"claude_{uuid}_{i}"
            chunks.append((chunk_id, chunk_text, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            window = window[-1:]  # keep the last message as overlap
    return chunks


def ingest_conversations(path):
    print(f"\nIngesting conversations from {path.name}...")
    conversations = []
    # Handle both .json (array) and .jsonl (one per line)
    raw = path.read_text(encoding="utf-8").strip()
    if raw.startswith("["):
        conversations = json.loads(raw)
    else:
        for line in raw.splitlines():
            line = line.strip()
            if line:
                try:
                    conversations.append(json.loads(line))
                except json.JSONDecodeError:
                    continue
    print(f"Found {len(conversations)} conversations")

    total = 0
    skipped = 0
    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue
        ids = [c[0] for c in chunks]
        texts = [c[1] for c in chunks]
        metas = [c[2] for c in chunks]
        # Skip chunks already in the collection so re-runs are idempotent.
        existing = collection.get(ids=ids)
        existing_ids = set(existing["ids"])
        new = [
            (cid, txt, meta)
            for cid, txt, meta in zip(ids, texts, metas)
            if cid not in existing_ids
        ]
        if not new:
            continue
        embeddings = embedder.encode([n[1] for n in new]).tolist()
        collection.add(
            ids=[n[0] for n in new],
            documents=[n[1] for n in new],
            metadatas=[n[2] for n in new],
            embeddings=embeddings,
        )
        total += len(new)
    print(f"Conversations: {total} chunks added, {skipped} skipped")
    return total


def ingest_memories(path):
    print(f"\nIngesting memories from {path.name}...")
    raw = json.loads(path.read_text(encoding="utf-8"))
    # Memories are a list of memory objects
    memories = raw if isinstance(raw, list) else raw.get("memories", [])
    if not memories:
        print("No memories found")
        return 0
    # Combine all memories into one chunk — they're already distilled
    memory_text = "\n".join([
        f"- {m.get('content', m) if isinstance(m, dict) else m}"
        for m in memories
    ])
    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
    chunk_id = "claude_memories_consolidated"
    existing = collection.get(ids=[chunk_id])
    if existing["ids"]:
        # Update by deleting and re-adding
        collection.delete(ids=[chunk_id])
    embedding = embedder.encode([chunk_text]).tolist()
    collection.add(
        ids=[chunk_id],
        documents=[chunk_text],
        metadatas=[{
            "source": "Claude: Memory",
            "type": "claude_memory",
        }],
        embeddings=embedding,
    )
    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
    return 1


# Run ingestion
export_dir = Path(EXPORT_DIR)
total = 0

conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
    total += ingest_conversations(f)

mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
    total += ingest_memories(f)

if total == 0:
    print("\nNo files found or no new chunks to add.")
else:
    print(f"\nTotal chunks added to corpus: {total}")

# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
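
# Optional sanity check: a minimal sketch, not part of the original ingest
# flow. It assumes the embedder and collection defined above and runs a single
# retrieval to confirm the corpus is actually queryable; the query string here
# is just an illustrative placeholder.
n_results = min(3, collection.count())
if n_results:
    query_embedding = embedder.encode(["what has Claude learned about Aaron?"]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=n_results)
    print("\nSample retrieval:")
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        # Show the source label and the first 80 characters of each hit.
        print(f"  {meta['source']}: {doc[:80]}")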