Update ingest_claude.py — handle .json format, ingest memories, handle both array and jsonl
ingest_claude.py | +73 -35
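This change replaces the .jsonl-only ingest_file() with ingest_conversations(), which detects whether a Claude export is a single JSON array or JSON Lines, adds ingest_memories() to index the memory export as one consolidated chunk, and reworks the driver to glob for conversations.* and memories.* instead of *.jsonl files.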
@@ -1,7 +1,6 @@
 import json
 import sys
 from pathlib import Path
-from datetime import datetime
 from sentence_transformers import SentenceTransformer
 import chromadb
 
@@ -18,13 +17,11 @@ collection = client.get_or_create_collection(
 )
 
 def extract_messages(convo):
-    """Extract messages from a Claude conversation object."""
     messages = []
     for msg in convo.get("chat_messages", []):
         role = msg.get("sender", "")
         if role not in ["human", "assistant"]:
             continue
-        # Claude export stores content as a list of content blocks
         content = msg.get("content", [])
         text = ""
         if isinstance(content, str):
@@ -38,12 +35,10 @@ def extract_messages(convo):
         text = text.strip()
         if not text:
             continue
-        created_at = msg.get("created_at", "")
-        messages.append((created_at, role, text))
+        messages.append((msg.get("created_at", ""), role, text))
     return messages
 
 def chunk_conversation(convo):
-    """Turn a conversation into indexable chunks."""
     chunks = []
     title = convo.get("name", "Untitled conversation")
     uuid = convo.get("uuid", "")
@@ -52,7 +47,6 @@ def chunk_conversation(convo):
     if not messages:
         return chunks
 
-    # Chunk into sliding windows of 3 messages
     window = []
     for i, (ts, role, text) in enumerate(messages):
         label = "You" if role == "human" else "Claude"
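A note on the windowing above: the loop fills a 3-message window and, via the `window = window[-1:]` line in the next hunk, carries the last message over into the following chunk. A minimal standalone sketch of that behavior (the helper name and the trailing-flush rule are assumptions, not part of the commit):

```python
def windows(messages, size=3, overlap=1):
    out, window = [], []
    for msg in messages:
        window.append(msg)
        if len(window) == size:
            out.append(list(window))
            window = window[-overlap:]  # keep the last message as overlap
    if len(window) > overlap:  # flush a trailing partial window (assumed)
        out.append(list(window))
    return out

print(windows(["m1", "m2", "m3", "m4", "m5"]))
# [['m1', 'm2', 'm3'], ['m3', 'm4', 'm5']]
```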
@@ -65,25 +59,28 @@ def chunk_conversation(convo):
                 "type": "claude_conversation",
                 "created_at": created_at,
             }))
-            window = window[-1:]  # overlap by 1
+            window = window[-1:]
 
     return chunks
 
-def ingest_file(jsonl_path):
-    print(f"Processing {jsonl_path.name}...")
+def ingest_conversations(path):
+    print(f"\nIngesting conversations from {path.name}...")
     conversations = []
-    with open(jsonl_path, encoding="utf-8") as f:
-        for line in f:
+    # Handle both .json (array) and .jsonl (one per line)
+    raw = path.read_text(encoding="utf-8").strip()
+    if raw.startswith("["):
+        conversations = json.loads(raw)
+    else:
+        for line in raw.splitlines():
             line = line.strip()
-            if not line:
-                continue
-            try:
-                conversations.append(json.loads(line))
-            except json.JSONDecodeError:
-                continue
+            if line:
+                try:
+                    conversations.append(json.loads(line))
+                except json.JSONDecodeError:
+                    continue
 
     print(f"Found {len(conversations)} conversations")
-    total_chunks = 0
+    total = 0
     skipped = 0
 
     for convo in conversations:
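For reference, a self-contained sketch of the two export shapes the new ingest_conversations() accepts; the sample data is invented, but the uuid/name/chat_messages fields match the code above:

```python
import json

array_style = '[{"uuid": "a1", "name": "First chat", "chat_messages": []}]'
jsonl_style = (
    '{"uuid": "a1", "name": "First chat", "chat_messages": []}\n'
    '{"uuid": "b2", "name": "Second chat", "chat_messages": []}'
)

for raw in (array_style, jsonl_style):
    raw = raw.strip()
    if raw.startswith("["):  # .json: a single array of conversation objects
        convos = json.loads(raw)
    else:                    # .jsonl: one conversation object per line
        convos = [json.loads(l) for l in raw.splitlines() if l.strip()]
    print(len(convos))  # prints 1, then 2
```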
@@ -96,10 +93,10 @@ def ingest_file(jsonl_path):
         texts = [c[1] for c in chunks]
         metas = [c[2] for c in chunks]
 
-        # Check existing
         existing = collection.get(ids=ids)
         existing_ids = set(existing["ids"])
-        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas) if id not in existing_ids]
+        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
+               if id not in existing_ids]
 
         if not new:
             continue
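The skip-existing logic in this hunk leans on Chroma returning, from get(ids=...), only the ids that are actually stored. A minimal in-memory demo of the pattern (the collection name and dummy embeddings are illustrative):

```python
import chromadb

client = chromadb.Client()  # ephemeral in-memory instance
col = client.get_or_create_collection("demo")
col.add(ids=["a"], documents=["already ingested"], embeddings=[[0.0, 1.0]])

ids = ["a", "b"]
docs = ["already ingested", "fresh chunk"]
existing_ids = set(col.get(ids=ids)["ids"])  # only "a" comes back
new = [(i, d) for i, d in zip(ids, docs) if i not in existing_ids]
print(new)  # [('b', 'fresh chunk')]
```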
@@ -111,25 +108,66 @@ def ingest_file(jsonl_path):
             metadatas=[n[2] for n in new],
             embeddings=embeddings,
         )
-        total_chunks += len(new)
+        total += len(new)
 
-    print(f"Done. {total_chunks} chunks added, {skipped} conversations skipped.")
-    return total_chunks
+    print(f"Conversations: {total} chunks added, {skipped} skipped")
+    return total
 
-# Find the export file
+def ingest_memories(path):
+    print(f"\nIngesting memories from {path.name}...")
+    raw = json.loads(path.read_text(encoding="utf-8"))
+
+    # Memories are a list of memory objects
+    memories = raw if isinstance(raw, list) else raw.get("memories", [])
+    if not memories:
+        print("No memories found")
+        return 0
+
+    # Combine all memories into one chunk — they're already distilled
+    memory_text = "\n".join([
+        f"- {m.get('content', m) if isinstance(m, dict) else m}"
+        for m in memories
+    ])
+
+    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
+    chunk_id = "claude_memories_consolidated"
+
+    existing = collection.get(ids=[chunk_id])
+    if existing["ids"]:
+        # Update by deleting and re-adding
+        collection.delete(ids=[chunk_id])
+
+    embedding = embedder.encode([chunk_text]).tolist()
+    collection.add(
+        ids=[chunk_id],
+        documents=[chunk_text],
+        metadatas=[{
+            "source": "Claude: Memory",
+            "type": "claude_memory",
+        }],
+        embeddings=embedding,
+    )
+
+    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
+    return 1
+
+# Run ingestion
 export_dir = Path(EXPORT_DIR)
-export_dir.mkdir(parents=True, exist_ok=True)
-
-jsonl_files = list(export_dir.glob("*.jsonl")) + list(export_dir.glob("**/*.jsonl"))
-
-if not jsonl_files:
-    print(f"No .jsonl files found in {EXPORT_DIR}")
-    print("Place your Claude export conversations.jsonl file there and run again.")
-    sys.exit(0)
 
 total = 0
-for f in jsonl_files:
-    total += ingest_file(f)
 
-print(f"\nTotal chunks added to corpus: {total}")
-print(f"Database at: {db_path}")
+conv_files = list(export_dir.glob("conversations.*"))
+for f in conv_files:
+    total += ingest_conversations(f)
+
+mem_files = list(export_dir.glob("memories.*"))
+for f in mem_files:
+    total += ingest_memories(f)
+
+if total == 0:
+    print("\nNo files found or no new chunks to add.")
+else:
+    print(f"\nTotal chunks added to corpus: {total}")
+# Show updated corpus size
+count = collection.count()
+print(f"Corpus now contains {count} total chunks")
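Finally, a sketch of the memory-file shapes ingest_memories() tolerates: either a bare list or an object with a "memories" key, with items that are plain strings or dicts carrying a "content" field. The sample items here are invented:

```python
bare_list = ["Prefers Python", {"content": "Building a personal RAG pipeline"}]
wrapped = {"memories": bare_list}

for raw in (bare_list, wrapped):
    memories = raw if isinstance(raw, list) else raw.get("memories", [])
    memory_text = "\n".join([
        f"- {m.get('content', m) if isinstance(m, dict) else m}"
        for m in memories
    ])
    print(memory_text)
# - Prefers Python
# - Building a personal RAG pipeline   (printed once per shape)
```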