Add Claude conversation export ingestion script

2026-04-26 13:10:02 -04:00
parent 49a0d8ebc5
commit 619a9295ce
1 changed files with 135 additions and 0 deletions
@@ -0,0 +1,135 @@
 import json
 import sys
 from pathlib import Path
 from datetime import datetime
 from sentence_transformers import SentenceTransformer
 import chromadb
 # Storage locations: the Chroma database directory and the Claude export folder
 db_path = str(Path.home().joinpath("aaronai", "db"))
 EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"

 # Embedding model, loaded once when the script starts
 print("Loading embedding model...")
 embedder = SentenceTransformer("all-MiniLM-L6-v2")

 # Persistent vector store; the "aaronai" collection is configured for cosine distance
 client = chromadb.PersistentClient(path=db_path)
 collection = client.get_or_create_collection(name="aaronai", metadata={"hnsw:space": "cosine"})
 def extract_messages(convo):
    """Extract the human/assistant text messages from a conversation object.

    Args:
        convo: Mapping for one exported conversation. Messages are read from
            its ``"chat_messages"`` entry; a missing or null entry is treated
            as an empty list.

    Returns:
        A list of ``(created_at, role, text)`` tuples in input order. ``role``
        is ``"human"`` or ``"assistant"`` and ``text`` is the message text with
        surrounding whitespace stripped. Messages from other senders, entries
        that are not mappings, and messages without any text are left out.
    """
    messages = []
    # ``or []`` also covers an explicit null, which ``.get(key, [])`` would not.
    for msg in convo.get("chat_messages") or []:
        if not isinstance(msg, dict):
            continue
        role = msg.get("sender", "")
        if role not in ("human", "assistant"):
            continue
        # Content is either a plain string or a list of content blocks.
        content = msg.get("content")
        parts = []
        if isinstance(content, str):
            parts.append(content)
        elif isinstance(content, list):
            for block in content:
                if isinstance(block, str):
                    parts.append(block)
                elif isinstance(block, dict) and block.get("type") == "text":
                    piece = block.get("text")
                    # A null or non-string "text" value would break concatenation.
                    if isinstance(piece, str):
                        parts.append(piece)
        text = "".join(parts).strip()
        if not text:
            continue
        messages.append((msg.get("created_at", ""), role, text))
    return messages
 def chunk_conversation(convo):
    """Turn a conversation into indexable chunks.

    Messages are labelled ``You`` (human) or ``Claude`` and grouped into
    windows of up to three. After a window is emitted its last message is
    carried over, so consecutive chunks overlap by one message. The final
    message always closes a chunk, even if the window is not yet full.

    Args:
        convo: Mapping for one exported conversation; ``"name"``, ``"uuid"``
            and ``"created_at"`` are read from it.

    Returns:
        A list of ``(chunk_id, chunk_text, metadata)`` tuples, empty when the
        conversation has no usable messages. ``chunk_id`` is
        ``claude_<uuid>_<index of the last message in the window>``.
    """
    messages = extract_messages(convo)
    if not messages:
        return []
    # The default of ``dict.get`` only applies to a missing key, so a null or
    # empty value is normalised here as well.
    title = convo.get("name") or "Untitled conversation"
    convo_id = convo.get("uuid", "")
    # NOTE(review): a None here would end up in the chunk metadata; Chroma is
    # expected to want str/int/float/bool values - confirm for the installed version.
    created_at = convo.get("created_at") or ""
    header = f"[Claude conversation: {title}]\n\n"
    last_index = len(messages) - 1
    chunks = []
    window = []
    for i, (_ts, role, text) in enumerate(messages):
        label = "You" if role == "human" else "Claude"
        window.append(f"{label}: {text}")
        if len(window) >= 3 or i == last_index:
            chunk_text = header + "\n\n".join(window)
            chunk_id = f"claude_{convo_id}_{i}"
            chunks.append((chunk_id, chunk_text, {
                "source": f"Claude: {title}",
                "type": "claude_conversation",
                "created_at": created_at,
            }))
            window = window[-1:]  # overlap by 1
    return chunks
 def ingest_file(jsonl_path):
    """Index every conversation found in a JSON Lines export file.

    Each non-blank line is parsed as JSON. A line holding an object is taken
    as one conversation; a line holding an array contributes each object in
    it. Lines that fail to parse, or hold any other JSON value, are counted
    and reported instead of being dropped silently. Chunks whose ids are
    already in the collection are not embedded or added again.

    Args:
        jsonl_path: Location of the ``.jsonl`` file (``pathlib.Path`` or str).

    Returns:
        The number of chunks newly added to the collection.
    """
    jsonl_path = Path(jsonl_path)
    print(f"Processing {jsonl_path.name}...")
    conversations = []
    malformed = 0
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                malformed += 1
                continue
            if isinstance(record, dict):
                conversations.append(record)
            elif isinstance(record, list):
                conversations.extend(item for item in record if isinstance(item, dict))
            else:
                # Scalars cannot be conversations; chunking them would crash.
                malformed += 1
    print(f"Found {len(conversations)} conversations")
    if malformed:
        print(f"Warning: ignored {malformed} malformed lines in {jsonl_path.name}")
    total_chunks = 0
    skipped = 0
    for convo in conversations:
        chunks = chunk_conversation(convo)
        if not chunks:
            skipped += 1
            continue
        # Only embed chunks whose ids are not stored yet, so re-runs are cheap.
        existing = collection.get(ids=[chunk[0] for chunk in chunks])
        existing_ids = set(existing["ids"])
        new = [chunk for chunk in chunks if chunk[0] not in existing_ids]
        if not new:
            continue
        new_ids = [chunk[0] for chunk in new]
        new_texts = [chunk[1] for chunk in new]
        new_metas = [chunk[2] for chunk in new]
        embeddings = embedder.encode(new_texts).tolist()
        collection.add(
            ids=new_ids,
            documents=new_texts,
            metadatas=new_metas,
            embeddings=embeddings,
        )
        total_chunks += len(new)
    print(f"Done. {total_chunks} chunks added, {skipped} conversations skipped.")
    return total_chunks
 # Find the export file(s). The export directory is created if it is missing
 # so there is a place to drop the files on a first run.
 export_dir = Path(EXPORT_DIR)
 export_dir.mkdir(parents=True, exist_ok=True)
 # A recursive glob already includes files directly inside export_dir, so
 # combining it with a plain "*.jsonl" glob would process those files twice.
 # Sorting keeps the processing order stable between runs.
 jsonl_files = sorted(export_dir.rglob("*.jsonl"))
 if not jsonl_files:
    print(f"No .jsonl files found in {EXPORT_DIR}")
    print("Place your Claude export conversations.jsonl file there and run again.")
    sys.exit(0)
 # Ingest each file and keep a running total of newly added chunks.
 total = 0
 for f in jsonl_files:
    total += ingest_file(f)
 print(f"\nTotal chunks added to corpus: {total}")
 print(f"Database at: {db_path}")