chore: archive deprecated chromadb and migration scripts

2026-04-28 00:15:46 +00:00
parent d5b5c2ec14
commit 037d747573
10 changed files with 486 additions and 11 deletions
+250
@@ -0,0 +1,250 @@
import os
import json
from pathlib import Path
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
import anthropic
from datetime import datetime
load_dotenv(Path.home() / "aaronai" / ".env")
memory_path = Path.home() / "aaronai" / "memory.md"
db_path = str(Path.home() / "aaronai" / "db")
print("Loading Aaron AI...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chroma_client = chromadb.PersistentClient(path=db_path)
collection = chroma_client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine"}
)
anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
SYSTEM_PROMPT = """You are Aaron Nelson's personal AI assistant. Aaron is an Associate Professor
of Digital Design & Fabrication and Program Director of the Hudson Valley Additive Manufacturing
Center (HVAMC) at SUNY New Paltz. He is an expert in computational design, additive manufacturing,
and digital fabrication with deep fluency in Rhino, Grasshopper, Stratasys FDM, PolyJet, and metal
3D printing workflows. He runs a commercial venture called Mossygear and a consulting operation
called FWN3D. He has a background in graffiti lettering and vector illustration.
You have been provided with relevant excerpts from Aaron's own documents and his persistent memory.
Use this context to give answers grounded in his actual work and history. When helping him write
or create, match his voice and draw on his existing materials. Be direct and specific -
Aaron values precision over padding. Always cite which documents you drew from when relevant.
You have access to web search. Use it automatically when:
- Questions require current data (salaries, job postings, prices, news)
- Questions reference specific institutions, people, or organizations you need to verify
- Aaron's documents and memory don't contain sufficient information to answer well
Do not announce that you are searching. Just search and incorporate results naturally."""
CV_SOURCES = ["Aaron Nelson CV 2024.pdf"]
conversation_history = []
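# Web search tool definition (Anthropic's server-side web_search tool type); searches run when the model invokes the tool.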
TOOLS = [
{
"type": "web_search_20250305",
"name": "web_search"
}
]
def load_memory():
if memory_path.exists():
return memory_path.read_text(encoding="utf-8")
return ""
def save_memory(content):
memory_path.write_text(content, encoding="utf-8")
def add_to_memory(new_item):
memory = load_memory()
timestamp = datetime.now().strftime("%Y-%m-%d")
note = f"\n- [{timestamp}] {new_item}"
if "## Notes" not in memory:
memory += "\n\n## Notes"
memory += note
save_memory(memory)
def remove_from_memory(item):
memory = load_memory()
lines = memory.split("\n")
filtered = [l for l in lines if item.lower() not in l.lower()]
save_memory("\n".join(filtered))
return len(lines) - len(filtered)
def get_pinned_cv_context():
results = collection.get(
where={"source": "Aaron Nelson CV 2024.pdf"},
include=["documents", "metadatas"]
)
return results["documents"], results["metadatas"]
def is_professional_query(query):
keywords = [
"grant", "publication", "exhibition", "award", "fellowship",
"experience", "position", "job", "career", "cv", "resume",
"research", "work history", "accomplishment", "teaching",
"course", "client", "consultation", "presentation", "workshop",
"education", "degree", "institution", "service", "committee"
]
return any(keyword in query.lower() for keyword in keywords)
def retrieve_context(query, n_results=8):
query_embedding = embedder.encode([query]).tolist()
results = collection.query(
query_embeddings=query_embedding,
n_results=n_results,
include=["documents", "metadatas", "distances"]
)
context_pieces = []
sources = []
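    # For career-related questions, pin every CV chunk into the context regardless of vector distance.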
if is_professional_query(query):
cv_docs, cv_metas = get_pinned_cv_context()
for doc, meta in zip(cv_docs, cv_metas):
context_pieces.append(f"[CV] {doc}")
sources.append(meta["source"])
for doc, meta, dist in zip(
results["documents"][0],
results["metadatas"][0],
results["distances"][0]
):
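        # Chroma returns cosine distance; 1 - distance gives a rough similarity score, and weak matches are dropped.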
relevance = 1 - dist
if relevance > 0.3 and meta["source"] not in CV_SOURCES:
context_pieces.append(doc)
sources.append(meta["source"])
return context_pieces, sources
def handle_command(user_input):
stripped = user_input.strip().lower()
if stripped == "show memory":
memory = load_memory()
print(f"\nAaron AI: Current memory:\n\n{memory}")
return True
if stripped.startswith("remember:"):
item = user_input[9:].strip()
add_to_memory(item)
print(f"\nAaron AI: Saved to memory: '{item}'")
return True
if stripped.startswith("forget:"):
item = user_input[7:].strip()
removed = remove_from_memory(item)
if removed:
print(f"\nAaron AI: Removed {removed} line(s) containing '{item}' from memory.")
else:
print(f"\nAaron AI: Nothing found in memory containing '{item}'.")
return True
if stripped == "clear":
conversation_history.clear()
print("\nAaron AI: Conversation history cleared.")
return True
return False
def chat(user_message):
memory = load_memory()
context_pieces, sources = retrieve_context(user_message)
context_parts = []
if memory:
context_parts.append(f"Aaron's persistent memory:\n\n{memory}")
if context_pieces:
context_str = "\n\n---\n\n".join(context_pieces)
unique_sources = list(set(sources))
context_parts.append(
f"Relevant excerpts from Aaron's documents:\n\n{context_str}\n\nSources: {', '.join(unique_sources)}"
)
context_block = "\n\n====\n\n".join(context_parts) + "\n\n---\n\n" if context_parts else ""
full_message = context_block + user_message
# Build messages for this turn
messages = conversation_history + [{"role": "user", "content": full_message}]
# Agentic loop to handle tool use
while True:
response = anthropic_client.messages.create(
model="claude-sonnet-4-6",
max_tokens=2048,
system=SYSTEM_PROMPT,
tools=TOOLS,
messages=messages
)
# Check if we need to handle tool calls
if response.stop_reason == "tool_use":
# Add assistant response to messages
messages.append({"role": "assistant", "content": response.content})
# Process each tool use block
tool_results = []
for block in response.content:
if block.type == "tool_use":
tool_results.append({
"type": "tool_result",
"tool_use_id": block.id,
"content": "Search completed"
})
# Add tool results and continue
messages.append({"role": "user", "content": tool_results})
else:
# Final response - extract text
assistant_message = ""
for block in response.content:
if hasattr(block, "text"):
assistant_message += block.text
# Update conversation history with clean versions
conversation_history.append({"role": "user", "content": full_message})
conversation_history.append({"role": "assistant", "content": assistant_message})
if len(conversation_history) > 20:
conversation_history.pop(0)
conversation_history.pop(0)
return assistant_message, sources
def main():
print("Aaron AI ready. Corpus, memory, and web search loaded.")
print("Commands: 'remember: [fact]' | 'forget: [text]' | 'show memory' | 'clear' | 'quit'")
print("=" * 60)
while True:
try:
user_input = input("\nYou: ").strip()
if not user_input:
continue
if user_input.strip().lower() == "quit":
print("Goodbye.")
break
if handle_command(user_input):
continue
response, sources = chat(user_input)
print(f"\nAaron AI: {response}")
if sources:
unique = list(set(sources))
print(f"\n[Sources: {', '.join(unique)}]")
except KeyboardInterrupt:
print("\nGoodbye.")
break
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
main()
+152
@@ -0,0 +1,152 @@
import json
from pathlib import Path
from datetime import datetime
import chromadb
from sentence_transformers import SentenceTransformer
# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/ChatGPT Export"
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
def extract_messages(convo):
"""Extract ordered user/assistant messages from a conversation."""
mapping = convo.get("mapping", {})
messages = []
for node in mapping.values():
msg = node.get("message")
if not msg:
continue
role = msg.get("author", {}).get("role")
if role not in ["user", "assistant"]:
continue
content = msg.get("content", {})
parts = content.get("parts", [])
# Extract text parts only
text = ""
for part in parts:
if isinstance(part, str):
text += part
elif isinstance(part, dict) and part.get("content_type") == "text":
text += part.get("text", "")
text = text.strip()
if not text:
continue
create_time = msg.get("create_time") or 0
messages.append((create_time, role, text))
# Sort by timestamp
messages.sort(key=lambda x: x[0])
return messages
def chunk_conversation(title, messages, chunk_size=600, overlap=100):
"""Convert a conversation into overlapping text chunks."""
# Build full conversation text
lines = [f"[Conversation: {title}]", ""]
for _, role, text in messages:
label = "Aaron" if role == "user" else "ChatGPT"
lines.append(f"{label}: {text}")
lines.append("")
full_text = "\n".join(lines)
# Split into word-level chunks with overlap
words = full_text.split()
chunks = []
start = 0
while start < len(words):
end = start + chunk_size
chunk = " ".join(words[start:end])
if chunk.strip():
chunks.append(chunk)
start += chunk_size - overlap
return chunks
def ingest_file(json_path):
print(f"\nLoading {json_path.name}...")
    with open(json_path, encoding="utf-8") as fh:
        data = json.load(fh)
print(f"Found {len(data)} conversations")
total_chunks = 0
skipped = 0
for i, convo in enumerate(data):
title = convo.get("title", "Untitled")
convo_id = convo.get("id", f"convo_{i}")
create_time = convo.get("create_time", 0)
try:
date_str = datetime.fromtimestamp(create_time).strftime("%Y-%m-%d")
        except Exception:
date_str = "unknown"
messages = extract_messages(convo)
if len(messages) < 2:
skipped += 1
continue
chunks = chunk_conversation(title, messages)
if not chunks:
skipped += 1
continue
# Embed and store
embeddings = embedder.encode(chunks).tolist()
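        # Deterministic chunk ids (conversation id + index) so re-running the ingest upserts instead of duplicating.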
ids = [f"chatgpt_{convo_id}_{j}" for j in range(len(chunks))]
metadatas = [{
"source": f"ChatGPT: {title}",
"filepath": str(json_path),
"date": date_str,
"type": "chatgpt_conversation"
} for _ in chunks]
collection.upsert(
documents=chunks,
embeddings=embeddings,
ids=ids,
metadatas=metadatas
)
total_chunks += len(chunks)
print(f" [{i+1}/{len(data)}] {title[:60]}{len(chunks)} chunks ({date_str})")
print(f"\nDone with {json_path.name}: {total_chunks} chunks indexed, {skipped} conversations skipped")
return total_chunks
def main():
export_dir = Path(EXPORT_DIR)
files = [
export_dir / "conversations-000.json",
export_dir / "conversations-001.json"
]
grand_total = 0
for f in files:
if f.exists():
grand_total += ingest_file(f)
else:
print(f"Not found: {f}")
print(f"\nTotal chunks added to corpus: {grand_total}")
print(f"Database at: {db_path}")
if __name__ == "__main__":
main()
+189
@@ -0,0 +1,189 @@
import os
import json
import json as json_module
from pathlib import Path
import chromadb
import psycopg2
from sentence_transformers import SentenceTransformer

def get_pg():
    # get_pg() is called by the conversation ingest below but was not defined in this file;
    # minimal sketch assuming the PG_DSN connection string is set in the environment.
    return psycopg2.connect(os.getenv("PG_DSN"))
# Paths
db_path = str(Path.home() / "aaronai" / "db")
EXPORT_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Archive/Misc/Claude Export"
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
def extract_messages(convo):
messages = []
for msg in convo.get("chat_messages", []):
role = msg.get("sender", "")
if role not in ["human", "assistant"]:
continue
content = msg.get("content", [])
text = ""
if isinstance(content, str):
text = content
elif isinstance(content, list):
for block in content:
if isinstance(block, dict) and block.get("type") == "text":
text += block.get("text", "")
elif isinstance(block, str):
text += block
text = text.strip()
if not text:
continue
messages.append((msg.get("created_at", ""), role, text))
return messages
def chunk_conversation(convo):
chunks = []
title = convo.get("name", "Untitled conversation")
uuid = convo.get("uuid", "")
created_at = convo.get("created_at", "")
messages = extract_messages(convo)
if not messages:
return chunks
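    # Slide a window of up to three messages over the conversation; the last message of each chunk carries over as overlap into the next.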
window = []
for i, (ts, role, text) in enumerate(messages):
label = "You" if role == "human" else "Claude"
window.append(f"{label}: {text}")
if len(window) >= 3 or i == len(messages) - 1:
chunk_text = f"[Claude conversation: {title}]\n\n" + "\n\n".join(window)
chunk_id = f"claude_{uuid}_{i}"
chunks.append((chunk_id, chunk_text, {
"source": f"Claude: {title}",
"type": "claude_conversation",
"created_at": created_at,
}))
window = window[-1:]
return chunks
def ingest_conversations(path):
print(f"\nIngesting conversations from {path.name}...")
conversations = []
# Handle both .json (array) and .jsonl (one per line)
raw = path.read_text(encoding="utf-8").strip()
if raw.startswith("["):
conversations = json.loads(raw)
else:
for line in raw.splitlines():
line = line.strip()
if line:
try:
conversations.append(json.loads(line))
                except json.JSONDecodeError:
continue
print(f"Found {len(conversations)} conversations")
total = 0
skipped = 0
for convo in conversations:
chunks = chunk_conversation(convo)
if not chunks:
skipped += 1
continue
ids = [c[0] for c in chunks]
texts = [c[1] for c in chunks]
metas = [c[2] for c in chunks]
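        # Check the ChromaDB collection for ids that were already ingested and keep only new chunks.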
existing = collection.get(ids=ids)
existing_ids = set(existing["ids"])
new = [(id, txt, meta) for id, txt, meta in zip(ids, texts, metas)
if id not in existing_ids]
if not new:
continue
embeddings = embedder.encode([n[1] for n in new]).tolist()
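        # New chunks are written to PostgreSQL/pgvector here; the ChromaDB collection above is only consulted for dedup.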
pg = get_pg()
cur = pg.cursor()
for (chunk_id, chunk_text, meta), embedding in zip(new, embeddings):
cur.execute("""
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
ON CONFLICT (id) DO UPDATE SET
document = EXCLUDED.document,
embedding = EXCLUDED.embedding,
source = EXCLUDED.source,
type = EXCLUDED.type,
created_at = EXCLUDED.created_at,
metadata = EXCLUDED.metadata
""", (
chunk_id, chunk_text, embedding,
meta.get('source'), meta.get('type'), meta.get('created_at'),
json_module.dumps(meta)
))
pg.commit()
pg.close()
total += len(new)
print(f"Conversations: {total} chunks added, {skipped} skipped")
return total
def ingest_memories(path):
print(f"\nIngesting memories from {path.name}...")
raw = json.loads(path.read_text(encoding="utf-8"))
# Memories are a list of memory objects
memories = raw if isinstance(raw, list) else raw.get("memories", [])
if not memories:
print("No memories found")
return 0
# Combine all memories into one chunk — they're already distilled
memory_text = "\n".join([
f"- {m.get('content', m) if isinstance(m, dict) else m}"
for m in memories
])
chunk_text = f"[Claude memory — what Claude has learned about Aaron]\n\n{memory_text}"
chunk_id = "claude_memories_consolidated"
existing = collection.get(ids=[chunk_id])
if existing["ids"]:
# Update by deleting and re-adding
collection.delete(ids=[chunk_id])
embedding = embedder.encode([chunk_text]).tolist()
collection.upsert(
ids=[chunk_id],
documents=[chunk_text],
metadatas=[{
"source": "Claude: Memory",
"type": "claude_memory",
}],
embeddings=embedding,
)
print(f"Memories: 1 chunk added ({len(memories)} memory items)")
return 1
# Run ingestion
export_dir = Path(EXPORT_DIR)
total = 0
conv_files = list(export_dir.glob("conversations.*"))
for f in conv_files:
total += ingest_conversations(f)
mem_files = list(export_dir.glob("memories.*"))
for f in mem_files:
total += ingest_memories(f)
if total == 0:
print("\nNo files found or no new chunks to add.")
else:
print(f"\nTotal chunks added to corpus: {total}")
# Show updated corpus size
count = collection.count()
print(f"Corpus now contains {count} total chunks")
+91
@@ -0,0 +1,91 @@
"""
Aaron AI — Migration: pgvector to Graphiti
One-time migration. Test with limit first: python3 migrate_to_graphiti.py 100
"""
import os, sys, json, time, requests, psycopg2
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
load_dotenv(Path.home() / "aaronai" / ".env")
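# Config: Graphiti sidecar endpoint, source Postgres DSN, and a progress file that lets an interrupted migration resume.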
GRAPHITI_URL = "http://localhost:8001"
PG_DSN = os.getenv("PG_DSN")
GROUP_ID = "aaron"
BATCH_PAUSE = 0.5
PROGRESS_FILE = Path.home() / "aaronai" / "migration_progress.json"
def load_progress():
if PROGRESS_FILE.exists():
return json.loads(PROGRESS_FILE.read_text())
return {"completed_ids": [], "failed_ids": []}
def save_progress(progress):
PROGRESS_FILE.write_text(json.dumps(progress, indent=2))
def migrate(limit=None):
try:
resp = requests.get(f"{GRAPHITI_URL}/health", timeout=5)
print(f"Graphiti: {resp.json()}")
except Exception as e:
print(f"ERROR: sidecar not reachable — {e}"); sys.exit(1)
progress = load_progress()
completed_ids = set(progress["completed_ids"])
failed_ids = progress["failed_ids"]
if completed_ids:
print(f"Resuming — {len(completed_ids)} done, {len(failed_ids)} failed")
pg = psycopg2.connect(PG_DSN)
cur = pg.cursor()
query = "SELECT id, document, source, created_at FROM embeddings ORDER BY created_at ASC"
if limit:
query += f" LIMIT {limit}"
cur.execute(query)
rows = cur.fetchall()
pg.close()
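    # Skip rows that were already migrated in a previous (possibly interrupted) run.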
pending = [r for r in rows if r[0] not in completed_ids]
print(f"Total: {len(rows)} | Pending: {len(pending)}{' [TEST]' if limit else ''}\n")
success = len(completed_ids)
failed = len(failed_ids)
start = time.time()
for i, (id, document, source, created_at) in enumerate(pending):
try:
src = (source or "unknown").replace("/", "-").replace(" ", "-")[:80]
name = f"{src}-{id[:8]}"
requests.post(f"{GRAPHITI_URL}/episodes", json={
"name": name,
"content": document,
"source_description": source or "nextcloud-corpus",
"timestamp": created_at or datetime.now().isoformat(),
"group_id": GROUP_ID,
}, timeout=120).raise_for_status()
success += 1
progress["completed_ids"].append(id)
if success % 10 == 0:
save_progress(progress)
if (i + 1) % 50 == 0:
elapsed = time.time() - start
rate = (i + 1) / elapsed
remaining = (len(pending) - i - 1) / rate if rate > 0 else 0
print(f" [{i+1}/{len(pending)}] {success} ok, {failed} failed | ~{remaining/60:.0f} min left")
time.sleep(BATCH_PAUSE)
except Exception as e:
failed += 1
progress["failed_ids"].append({"id": id, "error": str(e)})
print(f" FAILED {id}: {e}")
save_progress(progress)
time.sleep(2)
save_progress(progress)
elapsed = time.time() - start
print(f"\nDone — {success} ok, {failed} failed, {elapsed/60:.1f} min")
if limit and len(pending) > 0:
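        # Extrapolate the timed test batch to the full corpus (hard-coded total row count).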
est = (elapsed / len(pending)) * 12915 / 60
print(f"Estimated full run: ~{est:.0f} min")
if __name__ == "__main__":
migrate(int(sys.argv[1]) if len(sys.argv) > 1 else None)
+125
@@ -0,0 +1,125 @@
"""
Migration: ChromaDB → pgvector
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
Keeps ChromaDB intact as backup until migration is verified.
"""
import os
import sqlite3
import psycopg2
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
CHROMA_SQLITE = str(Path.home() / "aaronai" / "db" / "chroma.sqlite3")
PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
raise RuntimeError("PG_DSN environment variable not set")
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Connecting to databases...")
chroma = sqlite3.connect(CHROMA_SQLITE)
chroma.row_factory = sqlite3.Row
c = chroma.cursor()
pg = psycopg2.connect(PG_DSN)
pg_cur = pg.cursor()
# Get all documents with their metadata from ChromaDB
print("Reading documents from ChromaDB...")
c.execute("""
SELECT
e.id as row_id,
e.embedding_id,
MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document,
MAX(CASE WHEN em.key = 'source' THEN em.string_value END) as source,
MAX(CASE WHEN em.key = 'type' THEN em.string_value END) as type,
MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) as created_at
FROM embeddings e
LEFT JOIN embedding_metadata em ON e.id = em.id
GROUP BY e.id, e.embedding_id
HAVING document IS NOT NULL
ORDER BY e.id
""")
rows = c.fetchall()
print(f"Found {len(rows)} documents to migrate")
# Check existing in PostgreSQL
pg_cur.execute("SELECT id FROM embeddings")
existing_ids = set(r[0] for r in pg_cur.fetchall())
print(f"Already in PostgreSQL: {len(existing_ids)}")
# Filter to only new ones
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
print(f"Need to migrate: {len(to_migrate)}")
if not to_migrate:
print("Nothing to migrate — already complete")
chroma.close()
pg.close()
exit(0)
# Migrate in batches
batch_size = 200
migrated = 0
errors = 0
for i in range(0, len(to_migrate), batch_size):
batch = to_migrate[i:i+batch_size]
# Generate embeddings
texts = [r['document'] for r in batch]
try:
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
except Exception as e:
print(f"Embedding error at batch {i}: {e}")
errors += len(batch)
continue
# Insert into PostgreSQL
for row, embedding in zip(batch, embeddings):
try:
pg_cur.execute("""
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
ON CONFLICT (id) DO UPDATE SET
document = EXCLUDED.document,
embedding = EXCLUDED.embedding,
source = EXCLUDED.source,
type = EXCLUDED.type,
created_at = EXCLUDED.created_at,
metadata = EXCLUDED.metadata
""", (
row['embedding_id'],
row['document'],
embedding,
row['source'],
row['type'],
row['created_at'],
json.dumps({
'source': row['source'],
'type': row['type'],
'created_at': row['created_at'],
})
))
migrated += 1
except Exception as e:
print(f"Insert error for {row['embedding_id']}: {e}")
errors += 1
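    # Commit once per batch rather than per row.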
pg.commit()
print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")
# Final count
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
final_count = pg_cur.fetchone()[0]
chroma.close()
pg.close()
print(f"\nMigration complete:")
print(f" Migrated: {migrated}")
print(f" Errors: {errors}")
print(f" PostgreSQL total: {final_count}")