Update ingest_claude.py — handle .json format, ingest memories, handle both array and jsonl
+73 -35
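For context: the names EXPORT_DIR, db_path, embedder, client, and collection are used in the hunks below but defined in the unchanged top of the file, which this diff does not show. A minimal sketch of that setup, assuming sentence-transformers and a persistent Chroma client; the directory, database path, model, and collection name are illustrative guesses, not taken from this commit:

    # Sketch of the assumed, unshown file header (values are guesses).
    import chromadb
    from sentence_transformers import SentenceTransformer

    EXPORT_DIR = "exports/claude"       # assumed export location
    db_path = "chroma_db"               # assumed on-disk Chroma directory
    embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model
    client = chromadb.PersistentClient(path=db_path)
    collection = client.get_or_create_collection(
        name="corpus",                  # assumed collection name
    )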
@@ -1,7 +1,6 @@
 import json
-import sys
 from pathlib import Path
 from datetime import datetime
 from sentence_transformers import SentenceTransformer
 import chromadb
 
@@ -18,13 +17,11 @@ collection = client.get_or_create_collection(
 )
 
 def extract_messages(convo):
     """Extract messages from a Claude conversation object."""
     messages = []
     for msg in convo.get("chat_messages", []):
         role = msg.get("sender", "")
         if role not in ["human", "assistant"]:
             continue
         # Claude export stores content as a list of content blocks
         content = msg.get("content", [])
         text = ""
         if isinstance(content, str):
@@ -38,12 +35,10 @@ def extract_messages(convo):
         text = text.strip()
         if not text:
             continue
-        created_at = msg.get("created_at", "")
-        messages.append((created_at, role, text))
+        messages.append((msg.get("created_at", ""), role, text))
     return messages
 
 def chunk_conversation(convo):
     """Turn a conversation into indexable chunks."""
     chunks = []
     title = convo.get("name", "Untitled conversation")
     uuid = convo.get("uuid", "")
@@ -52,7 +47,6 @@ def chunk_conversation(convo):
     if not messages:
         return chunks
 
     # Chunk into sliding windows of 3 messages
     window = []
     for i, (ts, role, text) in enumerate(messages):
         label = "You" if role == "human" else "Claude"
@@ -65,25 +59,28 @@ def chunk_conversation(convo):
                 "type": "claude_conversation",
                 "created_at": created_at,
             }))
-            window = window[-1:]  # overlap by 1
 
+            window = window[-1:]
     return chunks
 
-def ingest_file(jsonl_path):
-    print(f"Processing {jsonl_path.name}...")
+def ingest_conversations(path):
+    print(f"\nIngesting conversations from {path.name}...")
     conversations = []
-    with open(jsonl_path, encoding="utf-8") as f:
-        for line in f:
 
+    # Handle both .json (array) and .jsonl (one per line)
+    raw = path.read_text(encoding="utf-8").strip()
+    if raw.startswith("["):
+        conversations = json.loads(raw)
+    else:
+        for line in raw.splitlines():
             line = line.strip()
             if not line:
                 continue
-            if line:
             try:
                 conversations.append(json.loads(line))
-            except:
+            except json.JSONDecodeError:
                 continue
 
     print(f"Found {len(conversations)} conversations")
-    total_chunks = 0
+    total = 0
     skipped = 0
 
     for convo in conversations:
@@ -96,10 +93,10 @@ def ingest_file(jsonl_path):
         texts = [c[1] for c in chunks]
         metas = [c[2] for c in chunks]
 
         # Check existing
         existing = collection.get(ids=ids)
         existing_ids = set(existing["ids"])
-        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas) if id not in existing_ids]
+        new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
+               if id not in existing_ids]
 
         if not new:
             continue
@@ -111,25 +108,66 @@ def ingest_file(jsonl_path):
             metadatas=[n[2] for n in new],
             embeddings=embeddings,
         )
-        total_chunks += len(new)
+        total += len(new)
 
-    print(f"Done. {total_chunks} chunks added, {skipped} conversations skipped.")
-    return total_chunks
+    print(f"Conversations: {total} chunks added, {skipped} skipped")
+    return total
 
-# Find the export file
+def ingest_memories(path):
+    print(f"\nIngesting memories from {path.name}...")
+    raw = json.loads(path.read_text(encoding="utf-8"))
+
+    # Memories are a list of memory objects
+    memories = raw if isinstance(raw, list) else raw.get("memories", [])
+    if not memories:
+        print("No memories found")
+        return 0
+
+    # Combine all memories into one chunk — they're already distilled
+    memory_text = "\n".join([
+        f"- {m.get('content', m) if isinstance(m, dict) else m}"
+        for m in memories
+    ])
+
+    chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
+    chunk_id = "claude_memories_consolidated"
+
+    existing = collection.get(ids=[chunk_id])
+    if existing["ids"]:
+        # Update by deleting and re-adding
+        collection.delete(ids=[chunk_id])
+
+    embedding = embedder.encode([chunk_text]).tolist()
+    collection.add(
+        ids=[chunk_id],
+        documents=[chunk_text],
+        metadatas=[{
+            "source": "Claude: Memory",
+            "type": "claude_memory",
+        }],
+        embeddings=embedding,
+    )
+
+    print(f"Memories: 1 chunk added ({len(memories)} memory items)")
+    return 1
 
+# Run ingestion
 export_dir = Path(EXPORT_DIR)
 export_dir.mkdir(parents=True, exist_ok=True)
 
-jsonl_files = list(export_dir.glob("*.jsonl")) + list(export_dir.glob("**/*.jsonl"))
-
-if not jsonl_files:
-    print(f"No .jsonl files found in {EXPORT_DIR}")
-    print("Place your Claude export conversations.jsonl file there and run again.")
-    sys.exit(0)
-
 total = 0
-for f in jsonl_files:
-    total += ingest_file(f)
+
+conv_files = list(export_dir.glob("conversations.*"))
+for f in conv_files:
+    total += ingest_conversations(f)
+
+mem_files = list(export_dir.glob("memories.*"))
+for f in mem_files:
+    total += ingest_memories(f)
 
 if total == 0:
     print("\nNo files found or no new chunks to add.")
 else:
     print(f"\nTotal chunks added to corpus: {total}")
     print(f"Database at: {db_path}")
 
 # Show updated corpus size
 count = collection.count()
 print(f"Corpus now contains {count} total chunks")
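Similarly, the isinstance checks in ingest_memories mean it accepts either a bare JSON list or an object with a memories key, where each item can be a dict carrying content or a plain string. Both of these invented payloads would ingest:

    # Two invented payload shapes that ingest_memories handles.
    memories_as_list = ["Prefers concise answers", {"content": "Works in Python"}]
    memories_as_object = {"memories": [{"content": "Enjoys hiking"}]}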
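Once ingestion has run, retrieval is ordinary Chroma querying. A minimal sketch, assuming the embedder and collection from the setup sketched above; the query text and result handling are illustrative:

    # Retrieval sketch against the ingested corpus (assumed setup above).
    query_embedding = embedder.encode(["what trips have we planned?"]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=5,
        where={"type": "claude_conversation"},  # or "claude_memory"
    )
    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta.get("source", "?"), "->", doc[:80])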