Migrate to pgvector — remove ChromaDB from api.py, ingest scripts, dream.py
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
"""
|
||||
Migration: ChromaDB → pgvector
|
||||
Re-embeds all documents from ChromaDB SQLite into PostgreSQL with pgvector.
|
||||
Keeps ChromaDB intact as backup until migration is verified.
|
||||
"""
|
||||
import sqlite3
|
||||
import psycopg2
|
||||
import json
|
||||
from pathlib import Path
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Path of the ChromaDB SQLite file under the current user's home directory;
# it is the read-only source of this migration.
CHROMA_SQLITE = str(Path.home().joinpath("aaronai", "db", "chroma.sqlite3"))

# libpq-style connection string for the PostgreSQL target.
# NOTE(review): the database password is embedded in source here — consider
# supplying it via the environment or a secrets store instead.
PG_DSN = " ".join((
    "dbname=aaronai",
    "user=aaronai",
    "password=aaronai_db_password",
    "host=localhost",
))
|
||||
|
||||
# Start-up: load the sentence-embedding model, then open both databases.
print("Loading embedding model...")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

print("Connecting to databases...")

# Source side: the ChromaDB SQLite file. sqlite3.Row lets later code address
# result columns by name (e.g. row['embedding_id']).
chroma = sqlite3.connect(CHROMA_SQLITE)
chroma.row_factory = sqlite3.Row
c = chroma.cursor()

# Target side: PostgreSQL, reached through the DSN defined above.
pg = psycopg2.connect(dsn=PG_DSN)
pg_cur = pg.cursor()
|
||||
|
||||
# Pull every document and its metadata out of ChromaDB.
#
# The query pivots the key/value rows of embedding_metadata into one row per
# embedding: each MAX(CASE ...) picks the string_value stored under one key.
# The HAVING clause drops embeddings that have no 'chroma:document' entry, so
# every returned row carries text that can be re-embedded.
print("Reading documents from ChromaDB...")
rows = c.execute("""
    SELECT
        e.id as row_id,
        e.embedding_id,
        MAX(CASE WHEN em.key = 'chroma:document' THEN em.string_value END) as document,
        MAX(CASE WHEN em.key = 'source' THEN em.string_value END) as source,
        MAX(CASE WHEN em.key = 'type' THEN em.string_value END) as type,
        MAX(CASE WHEN em.key = 'created_at' THEN em.string_value END) as created_at
    FROM embeddings e
    LEFT JOIN embedding_metadata em ON e.id = em.id
    GROUP BY e.id, e.embedding_id
    HAVING document IS NOT NULL
    ORDER BY e.id
""").fetchall()

print(f"Found {len(rows)} documents to migrate")
|
||||
|
||||
# Work out which documents still need migrating.
#
# Ids already present in the PostgreSQL `embeddings` table are collected into
# a set so the membership test below is O(1) per source row. This also makes
# the script safe to re-run after a partial migration.
pg_cur.execute("SELECT id FROM embeddings")
existing_ids = {record[0] for record in pg_cur.fetchall()}
print(f"Already in PostgreSQL: {len(existing_ids)}")

# Keep only the source rows whose embedding_id is not in PostgreSQL yet.
to_migrate = [r for r in rows if r['embedding_id'] not in existing_ids]
print(f"Need to migrate: {len(to_migrate)}")

if not to_migrate:
    print("Nothing to migrate — already complete")
    chroma.close()
    pg.close()
    # Raise SystemExit directly: the bare `exit()` helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    raise SystemExit(0)
|
||||
|
||||
# Migrate in batches.
#
# For each batch: embed all document texts in one model call, then upsert the
# rows one by one and commit once per batch.
#
# Each row is wrapped in its own SAVEPOINT. Without it, a single failing
# INSERT puts the PostgreSQL transaction into the aborted state: every
# following INSERT in the batch is rejected, the commit at the end of the
# batch persists nothing, and rows already counted as migrated are lost.
batch_size = 200
migrated = 0
errors = 0

# Loop-invariant upsert statement; the embedding is passed as a Python list
# and cast to pgvector's `vector` type on the server side.
upsert_sql = """
    INSERT INTO embeddings (id, document, embedding, source, type, created_at, metadata)
    VALUES (%s, %s, %s::vector, %s, %s, %s, %s)
    ON CONFLICT (id) DO UPDATE SET
        document = EXCLUDED.document,
        embedding = EXCLUDED.embedding,
        source = EXCLUDED.source,
        type = EXCLUDED.type,
        created_at = EXCLUDED.created_at,
        metadata = EXCLUDED.metadata
"""

for i in range(0, len(to_migrate), batch_size):
    batch = to_migrate[i:i + batch_size]

    # Generate embeddings for the whole batch at once.
    texts = [r['document'] for r in batch]
    try:
        embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
    except Exception as e:
        # Best effort: count the whole batch as failed and move on.
        print(f"Embedding error at batch {i}: {e}")
        errors += len(batch)
        continue

    # Insert into PostgreSQL, isolating each row behind a savepoint.
    for row, embedding in zip(batch, embeddings):
        pg_cur.execute("SAVEPOINT migrate_row")
        try:
            pg_cur.execute(upsert_sql, (
                row['embedding_id'],
                row['document'],
                embedding,
                row['source'],
                row['type'],
                row['created_at'],
                json.dumps({
                    'source': row['source'],
                    'type': row['type'],
                    'created_at': row['created_at'],
                }),
            ))
        except Exception as e:
            # Undo only this row so the rest of the batch can still commit.
            pg_cur.execute("ROLLBACK TO SAVEPOINT migrate_row")
            print(f"Insert error for {row['embedding_id']}: {e}")
            errors += 1
        else:
            pg_cur.execute("RELEASE SAVEPOINT migrate_row")
            migrated += 1

    pg.commit()
    print(f"Progress: {min(i+batch_size, len(to_migrate))}/{len(to_migrate)} ({errors} errors)")
|
||||
|
||||
# Final count: ask PostgreSQL how many rows the embeddings table now holds.
pg_cur.execute("SELECT COUNT(*) FROM embeddings")
(final_count,) = pg_cur.fetchone()

# Both connections are finished with; release them before reporting.
chroma.close()
pg.close()

# Summary of this run, one item per line.
print(
    "\nMigration complete:",
    f" Migrated: {migrated}",
    f" Errors: {errors}",
    f" PostgreSQL total: {final_count}",
    sep="\n",
)
|
||||
Reference in New Issue
Block a user