"""Ingest ChatGPT conversation exports into the local ChromaDB corpus."""

import json
from pathlib import Path
from datetime import datetime

from sentence_transformers import SentenceTransformer
import chromadb

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"}
)


def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation."""
    mapping = convo.get("mapping", {})
    messages = []
    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            continue
        role = msg.get("author", {}).get("role")
        if role not in ["user", "assistant"]:
            continue
        content = msg.get("content", {})
        parts = content.get("parts", [])
        # Extract text parts only; skip images, tool calls, etc.
        text = ""
        for part in parts:
            if isinstance(part, str):
                text += part
            elif isinstance(part, dict) and part.get("content_type") == "text":
                text += part.get("text", "")
        text = text.strip()
        if not text:
            continue
        create_time = msg.get("create_time") or 0
        messages.append((create_time, role, text))
    # Sort by timestamp
    messages.sort(key=lambda x: x[0])
    return messages


def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping text chunks."""
    # Build full conversation text
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")
    full_text = "\n".join(lines)

    # Split into word-level chunks with overlap
    words = full_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


def ingest_file(json_path):
    print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)
        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError, OverflowError):
            date_str = "unknown"

        messages = extract_messages(convo)
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation"
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas
        )

        total_chunks += len(chunks)
        print(f"  [{i+1}/{len(data)}] {title[:60]} - {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks


def main():
    export_dir = Path(EXPORT_DIR)
    files = [
        export_dir / "conversations-000.json",
        export_dir / "conversations-001.json"
    ]

    grand_total = 0
    for f in files:
        if f.exists():
            grand_total += ingest_file(f)
        else:
            print(f"Not found: {f}")

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")


if __name__ == "__main__":
    main()
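
# Example follow-up (a sketch, not part of this script): after ingestion you can
# sanity-check retrieval by embedding a query with the same model and calling
# collection.query() on the same collection. The query text below is illustrative.
#
#   q_emb = embedder.encode(["what did I ask about backups?"]).tolist()
#   results = collection.query(query_embeddings=q_emb, n_results=3)
#   for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
#       print(meta["date"], meta["source"], "->", doc[:80])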