Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, dream.py

2026-04-26 21:16:04 +00:00
parent d2eed98906
commit f78b83042b
6 changed files with 250 additions and 83 deletions
@@ -3,7 +3,9 @@ import sys
 import hashlib
 from pathlib import Path
 from dotenv import load_dotenv
-import chromadb
+import psycopg2
+import psycopg2.extras
+import json
 from sentence_transformers import SentenceTransformer
 from docx import Document
 from pypdf import PdfReader
@@ -14,12 +16,10 @@ load_dotenv(Path.home() / "aaronai" / ".env")
 print("Loading embedding model...")
 embedder = SentenceTransformer("all-MiniLM-L6-v2")

-db_path = str(Path.home() / "aaronai" / "db")
-client = chromadb.PersistentClient(path=db_path)
-collection = client.get_or_create_collection(
-    name="aaronai",
-    metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
-)
+PG_DSN = os.getenv("PG_DSN", "dbname=aaronai user=aaronai password=aaronai_db_password host=localhost")  # DSN string handed to psycopg2.connect; the PG_DSN env var overrides this fallback. NOTE(review): the fallback embeds a password — confirm it is local-dev only.
+
+def get_pg():  # Open and return a psycopg2 connection built from the module-level PG_DSN.
+    return psycopg2.connect(PG_DSN)  # Not closed here: the caller is expected to commit and close the connection.

 def extract_text_from_docx(path):
    doc = Document(path)
@@ -98,12 +98,24 @@ def ingest_file(filepath):
            "folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
        } for _ in chunks]

-        collection.upsert(
-            documents=chunks,
-            embeddings=embeddings,
-            ids=ids,
-            metadatas=metadatas
-        )
+        pg = get_pg()
+        cur = pg.cursor()
+        for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
+            cur.execute("""
+                INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
+                VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
+                ON CONFLICT (id) DO UPDATE SET
+                    document = EXCLUDED.document,
+                    embedding = EXCLUDED.embedding,
+                    source = EXCLUDED.source,
+                    metadata = EXCLUDED.metadata
+            """, (
+                chunk_id, chunk, embedding,
+                meta.get('source'), 'document', None,
+                json.dumps(meta)
+            ))
+        pg.commit()
+        pg.close()
        print(f"  Indexed {len(chunks)} chunks: {path.name}")
        return len(chunks)