Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, dream.py
This commit is contained in:
+25
-13
@@ -3,7 +3,9 @@ import sys
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import chromadb
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import json
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from docx import Document
|
||||
from pypdf import PdfReader
|
||||
@@ -14,12 +16,10 @@ load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
# Load the sentence-transformer once at module import; every ingested chunk
# is embedded with this single shared model instance.
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Vector storage now lives in Postgres/pgvector. The ChromaDB
# PersistentClient that previously persisted to ~/aaronai/db is removed as
# part of the pgvector migration — keeping both stores would write embeddings
# to two diverging backends.
# NOTE(review): the fallback DSN embeds credentials in source; prefer setting
# PG_DSN via the environment (loaded from ~/aaronai/.env above) — confirm the
# fallback is only used for local development.
PG_DSN = os.getenv("PG_DSN", "dbname=aaronai user=aaronai password=aaronai_db_password host=localhost")
|
||||
|
||||
def get_pg():
    """Open and return a fresh psycopg2 connection built from the module-level PG_DSN."""
    connection = psycopg2.connect(PG_DSN)
    return connection
|
||||
|
||||
def extract_text_from_docx(path):
|
||||
doc = Document(path)
|
||||
@@ -98,12 +98,24 @@ def ingest_file(filepath):
|
||||
"folder": str(path.parent.relative_to(Path(sys.argv[1]) if len(sys.argv) > 1 else path.parent))
|
||||
} for _ in chunks]
|
||||
|
||||
collection.upsert(
|
||||
documents=chunks,
|
||||
embeddings=embeddings,
|
||||
ids=ids,
|
||||
metadatas=metadatas
|
||||
)
|
||||
pg = get_pg()
|
||||
cur = pg.cursor()
|
||||
for chunk_id, chunk, embedding, meta in zip(ids, chunks, embeddings, metadatas):
|
||||
cur.execute("""
|
||||
INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
|
||||
VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
document = EXCLUDED.document,
|
||||
embedding = EXCLUDED.embedding,
|
||||
source = EXCLUDED.source,
|
||||
metadata = EXCLUDED.metadata
|
||||
""", (
|
||||
chunk_id, chunk, embedding,
|
||||
meta.get('source'), 'document', None,
|
||||
json.dumps(meta)
|
||||
))
|
||||
pg.commit()
|
||||
pg.close()
|
||||
print(f" Indexed {len(chunks)} chunks: {path.name}")
|
||||
return len(chunks)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user