Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, dream.py

This commit is contained in:
2026-04-26 21:16:04 +00:00
parent d2eed98906
commit f78b83042b
6 changed files with 250 additions and 83 deletions
+25 -13
View File
@@ -3,7 +3,9 @@ import sys
import hashlib
from pathlib import Path
from dotenv import load_dotenv
import chromadb
import psycopg2
import psycopg2.extras
import json
from sentence_transformers import SentenceTransformer
from docx import Document
from pypdf import PdfReader
@@ -14,12 +16,10 @@ load_dotenv(Path.home() / "aaronai" / ".env")
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
db_path = str(Path.home() / "aaronai" / "db")
client = chromadb.PersistentClient(path=db_path)
collection = client.get_or_create_collection(
name="aaronai",
metadata={"hnsw:space": "cosine", "hnsw:allow_replace_deleted": True}
)
PG_DSN = os.getenv("PG_DSN", "dbname=aaronai user=aaronai password=aaronai_db_password host=localhost")
def get_pg():
return psycopg2.connect(PG_DSN)
def extract_text_from_docx(path):
doc = Document(path)
@@ -98,12 +98,24 @@ def ingest_file(filepath):
"folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
} for _ in chunks]
collection.upsert(
documents=chunks,
embeddings=embeddings,
ids=ids,
metadatas=metadatas
)
pg = get_pg()
cur = pg.cursor()
for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
cur.execute("""
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
ON CONFLICT (id) DO UPDATE SET
document = EXCLUDED.document,
embedding = EXCLUDED.embedding,
source = EXCLUDED.source,
metadata = EXCLUDED.metadata
""", (
chunk_id, chunk, embedding,
meta.get('source'), 'document', None,
json.dumps(meta)
))
pg.commit()
pg.close()
print(f" Indexed {len(chunks)} chunks: {path.name}")
return len(chunks)