diff --git a/scripts/ingest_claude.py b/scripts/ingest_claude.py
index 4cf2245..f3d08e5 100644
--- a/scripts/ingest_claude.py
+++ b/scripts/ingest_claude.py
@@ -1,7 +1,6 @@
 import json
 import sys
 from pathlib import Path
-from datetime import datetime
 from sentence_transformers import SentenceTransformer
 import chromadb
 
@@ -18,13 +17,11 @@ collection = client.get_or_create_collection(
 )
 
 def extract_messages(convo):
-    """Extract messages from a Claude conversation object."""
     messages = []
     for msg in convo.get("chat_messages", []):
         role = msg.get("sender", "")
         if role not in ["human", "assistant"]:
             continue
-        # Claude export stores content as a list of content blocks
         content = msg.get("content", [])
         text = ""
         if isinstance(content, str):
@@ -38,12 +35,10 @@ def extract_messages(convo):
         text = text.strip()
         if not text:
             continue
-        created_at = msg.get("created_at", "")
-        messages.append((created_at, role, text))
+        messages.append((msg.get("created_at", ""), role, text))
     return messages
 
 def chunk_conversation(convo):
-    """Turn a conversation into indexable chunks."""
     chunks = []
     title = convo.get("name", "Untitled conversation")
     uuid = convo.get("uuid", "")
@@ -52,7 +47,6 @@ def chunk_conversation(convo):
     if not messages:
         return chunks
 
-    # Chunk into sliding windows of 3 messages
    window = []
     for i, (ts, role, text) in enumerate(messages):
         label = "You" if role == "human" else "Claude"
@@ -65,25 +59,28 @@ def chunk_conversation(convo):
                 "type": "claude_conversation",
                 "created_at": created_at,
             }))
-            window = window[-1:]  # overlap by 1
-
+            window = window[-1:]
     return chunks
 
-def ingest_file(jsonl_path):
-    print(f"Processing {jsonl_path.name}...")
+def ingest_conversations(path):
+    print(f"\nIngesting conversations from {path.name}...")
     conversations = []
-    with open(jsonl_path, encoding="utf-8") as f:
-        for line in f:
+
+    # Handle both .json (array) and .jsonl (one per line)
+    raw = path.read_text(encoding="utf-8").strip()
+    if raw.startswith("["):
+        conversations = json.loads(raw)
+    else:
+        for line in raw.splitlines():
             line = line.strip()
-            if not line:
-                continue
-            try:
-                conversations.append(json.loads(line))
-            except json.JSONDecodeError:
-                continue
+            if line:
+                try:
+                    conversations.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
 
     print(f"Found {len(conversations)} conversations")
-    total_chunks = 0
+    total = 0
     skipped = 0
 
     for convo in conversations:
@@ -92,14 +89,14 @@
             skipped += 1
             continue
 
-        ids = [c[0] for c in chunks]
-        texts = [c[1] for c in chunks]
-        metas = [c[2] for c in chunks]
+        ids = [c[0] for c in chunks]
+        texts = [c[1] for c in chunks]
+        metas = [c[2] for c in chunks]
 
-        # Check existing
-        existing = collection.get(ids=ids)
+        existing = collection.get(ids=ids)
         existing_ids = set(existing["ids"])
-        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas) if id not in existing_ids]
+        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
+               if id not in existing_ids]
 
         if not new:
             continue
@@ -111,25 +108,66 @@
             metadatas=[n[2] for n in new],
             embeddings=embeddings,
         )
-        total_chunks += len(new)
+        total += len(new)
 
-    print(f"Done. {total_chunks} chunks added, {skipped} conversations skipped.")
-    return total_chunks
+    print(f"Conversations: {total} chunks added, {skipped} skipped")
+    return total
 
-# Find the export file
+def ingest_memories(path):
+    print(f"\nIngesting memories from {path.name}...")
+    raw = json.loads(path.read_text(encoding="utf-8"))
+
+    # Memories are a list of memory objects
+    memories = raw if isinstance(raw, list) else raw.get("memories", [])
+    if not memories:
+        print("No memories found")
+        return 0
+
+    # Combine all memories into one chunk — they're already distilled
+    memory_text = "\n".join([
+        f"- {m.get('content', m) if isinstance(m, dict) else m}"
+        for m in memories
+    ])
+
+    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
+    chunk_id = "claude_memories_consolidated"
+
+    existing = collection.get(ids=[chunk_id])
+    if existing["ids"]:
+        # Update by deleting and re-adding
+        collection.delete(ids=[chunk_id])
+
+    embedding = embedder.encode([chunk_text]).tolist()
+    collection.add(
+        ids=[chunk_id],
+        documents=[chunk_text],
+        metadatas=[{
+            "source": "Claude: Memory",
+            "type": "claude_memory",
+        }],
+        embeddings=embedding,
+    )
+
+    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
+    return 1
+
+# Run ingestion
 export_dir = Path(EXPORT_DIR)
-export_dir.mkdir(parents=True, exist_ok=True)
-
-jsonl_files = list(export_dir.glob("*.jsonl")) + list(export_dir.glob("**/*.jsonl"))
-
-if not jsonl_files:
-    print(f"No .jsonl files found in {EXPORT_DIR}")
-    print("Place your Claude export conversations.jsonl file there and run again.")
-    sys.exit(0)
-
 total = 0
-for f in jsonl_files:
-    total += ingest_file(f)
-
-print(f"\nTotal chunks added to corpus: {total}")
-print(f"Database at: {db_path}")
+
+conv_files = list(export_dir.glob("conversations.*"))
+for f in conv_files:
+    total += ingest_conversations(f)
+
+mem_files = list(export_dir.glob("memories.*"))
+for f in mem_files:
+    total += ingest_memories(f)
+
+if total == 0:
+    print("\nNo files found or no new chunks to add.")
+else:
+    print(f"\nTotal chunks added to corpus: {total}")
+
+# Show updated corpus size
+count = collection.count()
+print(f"Corpus now contains {count} total chunks")
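
To exercise the new dual-format handling, here is a minimal fixture sketch that writes files in the shapes the parsers above accept. The field names (chat_messages, sender, content, created_at, uuid, name, memories) come straight from the code; the values, the content-block shape, and the ./claude_export directory are illustrative assumptions, not the documented Claude export format.

    import json
    from pathlib import Path

    # Assumed to match EXPORT_DIR in the script; adjust if yours differs.
    export_dir = Path("./claude_export")
    export_dir.mkdir(parents=True, exist_ok=True)

    # One conversation in the shape extract_messages()/chunk_conversation() read.
    # The content-block shape ({"type": "text", "text": ...}) is an assumption.
    convo = {
        "uuid": "test-0001",
        "name": "Test conversation",
        "chat_messages": [
            {"sender": "human", "created_at": "2024-01-01T00:00:00Z",
             "content": [{"type": "text", "text": "Hello"}]},
            {"sender": "assistant", "created_at": "2024-01-01T00:00:05Z",
             "content": [{"type": "text", "text": "Hi there"}]},
        ],
    }

    # ingest_conversations() accepts a JSON array (.json) or one object per line (.jsonl).
    (export_dir / "conversations.json").write_text(json.dumps([convo]), encoding="utf-8")

    # ingest_memories() accepts a bare list or {"memories": [...]}; items may be
    # plain strings or {"content": ...} objects.
    (export_dir / "memories.json").write_text(
        json.dumps({"memories": [{"content": "Prefers concise answers"}]}),
        encoding="utf-8",
    )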
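
And a quick retrieval smoke test once ingestion has run. The DB path, collection name, and model name below are placeholders: the real values are set near the top of ingest_claude.py, outside the hunks shown, so substitute them before running.

    import chromadb
    from sentence_transformers import SentenceTransformer

    # Placeholder config; copy the real db_path, collection name, and
    # embedding model from ingest_claude.py.
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="corpus")
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    query = "what has Claude learned about Aaron?"
    results = collection.query(
        query_embeddings=embedder.encode([query]).tolist(),
        n_results=3,
    )

    # query() returns parallel lists per query; index [0] is our single query.
    for doc, meta, dist in zip(results["documents"][0],
                               results["metadatas"][0],
                               results["distances"][0]):
        print(f"[{meta.get('source', '?')}] distance={dist:.3f}")
        print(doc[:200])
        print()

If the memories chunk was ingested, a query like this should surface the claude_memory chunk near the top, since the consolidated memory text is exactly the kind of distilled summary the embedder scores close to such questions.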