# Ingest ChatGPT conversation exports into a local ChromaDB collection:
# conversations are flattened, chunked with overlap, embedded with a
# sentence-transformers model, and upserted for retrieval.
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from sentence_transformers import SentenceTransformer
|
|
import chromadb
|
|
|
|
# Paths

# On-disk location of the persistent Chroma database (~/aaronai/db).
db_path = str(Path.home() / "aaronai" / "db")

# Directory holding the ChatGPT data-export JSON files to ingest.
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"
|
|
|
|
print("Loading embedding model...")

# Sentence-transformer used to embed every chunk before storage.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Persistent Chroma client; the collection uses cosine distance for its
# HNSW index so similarity search matches the normalized embeddings.
client = chromadb.PersistentClient(path=db_path)

collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)
|
|
|
|
def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation.

    Walks the conversation's ``mapping`` nodes, keeps only non-empty
    textual user/assistant messages, and returns them sorted by their
    ``create_time`` as ``(create_time, role, text)`` tuples.
    """
    results = []

    for node in convo.get("mapping", {}).values():
        message = node.get("message")
        if not message:
            continue

        author_role = message.get("author", {}).get("role")
        if author_role not in ["user", "assistant"]:
            continue

        parts = message.get("content", {}).get("parts", [])

        # Keep only textual parts: plain strings, or dicts tagged as text.
        pieces = []
        for part in parts:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and part.get("content_type") == "text":
                pieces.append(part.get("text", ""))

        body = "".join(pieces).strip()
        if not body:
            continue

        timestamp = message.get("create_time") or 0
        results.append((timestamp, author_role, body))

    # Mapping nodes carry no guaranteed order; sort chronologically.
    return sorted(results, key=lambda item: item[0])
|
|
|
|
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping word-level text chunks.

    Args:
        title: Conversation title, embedded in the transcript header.
        messages: Sequence of ``(create_time, role, text)`` tuples, as
            produced by ``extract_messages``.
        chunk_size: Maximum words per chunk; must exceed ``overlap``.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty only when the transcript has no words).

    Raises:
        ValueError: If ``overlap`` is negative or ``chunk_size <= overlap``
            (the window would never advance, looping forever).
    """
    # Guard: with chunk_size <= overlap the step below is <= 0 and the
    # original loop never terminated. Fail fast instead.
    if overlap < 0 or chunk_size <= overlap:
        raise ValueError("chunk_size must be positive and greater than overlap")

    # Build full conversation transcript with speaker labels.
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")

    full_text = "\n".join(lines)

    # Slide a chunk_size-word window forward by (chunk_size - overlap)
    # so consecutive chunks share `overlap` words of context.
    words = full_text.split()
    chunks = []
    start = 0
    step = chunk_size - overlap
    while start < len(words):
        chunk = " ".join(words[start:start + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
        start += step

    return chunks
|
|
|
|
def ingest_file(json_path):
    """Ingest one ChatGPT export JSON file into the Chroma collection.

    Args:
        json_path: ``Path`` to a ``conversations-*.json`` export file
            (a JSON array of conversation objects).

    Returns:
        Number of chunks embedded and upserted from this file.
    """
    print(f"\nLoading {json_path.name}...")
    # Context manager closes the handle; the original bare open() leaked it.
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        # Exports may carry None or out-of-range timestamps; narrow the
        # original bare except to what fromtimestamp actually raises.
        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError, OverflowError):
            date_str = "unknown"

        messages = extract_messages(convo)

        # Require at least one exchange to be worth indexing.
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        # Upsert (not add) so re-running the script is idempotent per id.
        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f" [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
|
|
|
|
def main():
    """Ingest every known export file and report the grand total."""
    export_root = Path(EXPORT_DIR)
    candidates = [
        export_root / "conversations-000.json",
        export_root / "conversations-001.json"
    ]

    grand_total = 0
    for candidate in candidates:
        # Skip missing export files rather than crashing mid-run.
        if not candidate.exists():
            print(f"Not found: {candidate}")
            continue
        grand_total += ingest_file(candidate)

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")
|
|
|
|
# Run the ingest only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|