Initial commit - Aaron AI v1
@@ -0,0 +1,150 @@
import json
from pathlib import Path
from datetime import datetime

from sentence_transformers import SentenceTransformer
import chromadb

# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"

print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
    name="aaronai",
    metadata={"hnsw:space": "cosine"},
)
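
# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings, and
# "hnsw:space": "cosine" tells Chroma's HNSW index to rank by cosine distance,
# the measure these embeddings are typically compared with.
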
def extract_messages(convo):
    """Extract ordered user/assistant messages from a conversation."""
    mapping = convo.get("mapping", {})
    messages = []

    for node in mapping.values():
        msg = node.get("message")
        if not msg:
            continue

        role = msg.get("author", {}).get("role")
        if role not in ["user", "assistant"]:
            continue

        content = msg.get("content", {})
        # "parts" may be missing or None for non-text content types
        parts = content.get("parts") or []

        # Keep text parts only; skip images, tool calls, and other payloads
        text = ""
        for part in parts:
            if isinstance(part, str):
                text += part
            elif isinstance(part, dict) and part.get("content_type") == "text":
                text += part.get("text", "")

        text = text.strip()
        if not text:
            continue

        create_time = msg.get("create_time") or 0
        messages.append((create_time, role, text))

    # Sort by timestamp so the transcript reads chronologically
    messages.sort(key=lambda x: x[0])
    return messages
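
# For reference, a node in the export's "mapping" dict looks roughly like
# (format details vary between export versions, hence the defensive .get() calls):
#   {"message": {"author": {"role": "user"},
#                "content": {"content_type": "text", "parts": ["Hello!"]},
#                "create_time": 1700000000.0},
#    "parent": "<node-id>", "children": ["<node-id>", ...]}
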
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
    """Convert a conversation into overlapping text chunks."""
    # Build the full conversation transcript
    lines = [f"[Conversation: {title}]", ""]
    for _, role, text in messages:
        label = "Aaron" if role == "user" else "ChatGPT"
        lines.append(f"{label}: {text}")
        lines.append("")

    full_text = "\n".join(lines)

    # Split into word-level chunks with overlap
    words = full_text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        if chunk.strip():
            chunks.append(chunk)
        start += chunk_size - overlap

    return chunks
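
# With the defaults, the window advances chunk_size - overlap = 500 words per
# step, so consecutive chunks share 100 words of context. For example, a
# 1,200-word transcript yields three chunks: words 0-599, 500-1099, 1000-1199.
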
def ingest_file(json_path):
    print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Found {len(data)} conversations")

    total_chunks = 0
    skipped = 0

    for i, convo in enumerate(data):
        title = convo.get("title", "Untitled")
        convo_id = convo.get("id", f"convo_{i}")
        create_time = convo.get("create_time", 0)

        try:
            date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except (TypeError, ValueError, OSError):
            date_str = "unknown"

        messages = extract_messages(convo)
        if len(messages) < 2:
            skipped += 1
            continue

        chunks = chunk_conversation(title, messages)
        if not chunks:
            skipped += 1
            continue

        # Embed and store
        embeddings = embedder.encode(chunks).tolist()
        ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
        metadatas = [{
            "source": f"ChatGPT: {title}",
            "filepath": str(json_path),
            "date": date_str,
            "type": "chatgpt_conversation",
        } for _ in chunks]

        collection.upsert(
            documents=chunks,
            embeddings=embeddings,
            ids=ids,
            metadatas=metadatas,
        )

        total_chunks += len(chunks)
        print(f" [{i+1}/{len(data)}] {title[:60]} — {len(chunks)} chunks ({date_str})")

    print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
    return total_chunks
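
# Stable, deterministic IDs (chatgpt_<convo_id>_<chunk index>) combined with
# upsert make re-running the ingest over the same export idempotent: existing
# chunks are overwritten in place rather than duplicated.
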
def main():
    export_dir = Path(EXPORT_DIR)
    files = [
        export_dir / "conversations-000.json",
        export_dir / "conversations-001.json",
    ]

    grand_total = 0
    for f in files:
        if f.exists():
            grand_total += ingest_file(f)
        else:
            print(f"Not found: {f}")

    print(f"\nTotal chunks added to corpus: {grand_total}")
    print(f"Database at: {db_path}")


if __name__ == "__main__":
    main()
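
# A minimal sketch of querying the corpus this script builds (assumed usage,
# not part of this script): reopen the same collection, embed the question
# with the same model, and let Chroma rank chunks by cosine distance.
#
#   results = collection.query(
#       query_embeddings=embedder.encode(["what did we discuss about backups?"]).tolist(),
#       n_results=5,
#   )
#   for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
#       print(meta["source"], meta["date"], "->", doc[:80])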