chore: archive deprecated chromadb and migration scripts
This commit is contained in:
@@ -0,0 +1,91 @@
|
||||
"""
|
||||
Aaron AI — Migration: pgvector to Graphiti
|
||||
One-time migration. Test with limit first: python3 migrate_to_graphiti.py 100
|
||||
"""
|
||||
import os, sys, json, time, requests, psycopg2
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(Path.home() / "aaronai" / ".env")
|
||||
|
||||
GRAPHITI_URL = "http://localhost:8001"
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
GROUP_ID = "aaron"
|
||||
BATCH_PAUSE = 0.5
|
||||
PROGRESS_FILE = Path.home() / "aaronai" / "migration_progress.json"
|
||||
|
||||
def load_progress():
|
||||
if PROGRESS_FILE.exists():
|
||||
return json.loads(PROGRESS_FILE.read_text())
|
||||
return {"completed_ids": [], "failed_ids": []}
|
||||
|
||||
def save_progress(progress):
|
||||
PROGRESS_FILE.write_text(json.dumps(progress, indent=2))
|
||||
|
||||
def migrate(limit=None):
|
||||
try:
|
||||
resp = requests.get(f"{GRAPHITI_URL}/health", timeout=5)
|
||||
print(f"Graphiti: {resp.json()}")
|
||||
except Exception as e:
|
||||
print(f"ERROR: sidecar not reachable — {e}"); sys.exit(1)
|
||||
|
||||
progress = load_progress()
|
||||
completed_ids = set(progress["completed_ids"])
|
||||
failed_ids = progress["failed_ids"]
|
||||
if completed_ids:
|
||||
print(f"Resuming — {len(completed_ids)} done, {len(failed_ids)} failed")
|
||||
|
||||
pg = psycopg2.connect(PG_DSN)
|
||||
cur = pg.cursor()
|
||||
query = "SELECT id, document, source, created_at FROM embeddings ORDER BY created_at ASC"
|
||||
if limit:
|
||||
query += f" LIMIT {limit}"
|
||||
cur.execute(query)
|
||||
rows = cur.fetchall()
|
||||
pg.close()
|
||||
|
||||
pending = [r for r in rows if r[0] not in completed_ids]
|
||||
print(f"Total: {len(rows)} | Pending: {len(pending)}{' [TEST]' if limit else ''}\n")
|
||||
|
||||
success = len(completed_ids)
|
||||
failed = len(failed_ids)
|
||||
start = time.time()
|
||||
|
||||
for i, (id, document, source, created_at) in enumerate(pending):
|
||||
try:
|
||||
src = (source or "unknown").replace("/", "-").replace(" ", "-")[:80]
|
||||
name = f"{src}-{id[:8]}"
|
||||
requests.post(f"{GRAPHITI_URL}/episodes", json={
|
||||
"name": name,
|
||||
"content": document,
|
||||
"source_description": source or "nextcloud-corpus",
|
||||
"timestamp": created_at or datetime.now().isoformat(),
|
||||
"group_id": GROUP_ID,
|
||||
}, timeout=120).raise_for_status()
|
||||
success += 1
|
||||
progress["completed_ids"].append(id)
|
||||
if success % 10 == 0:
|
||||
save_progress(progress)
|
||||
if (i + 1) % 50 == 0:
|
||||
elapsed = time.time() - start
|
||||
rate = (i + 1) / elapsed
|
||||
remaining = (len(pending) - i - 1) / rate if rate > 0 else 0
|
||||
print(f" [{i+1}/{len(pending)}] {success} ok, {failed} failed | ~{remaining/60:.0f} min left")
|
||||
time.sleep(BATCH_PAUSE)
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
progress["failed_ids"].append({"id": id, "error": str(e)})
|
||||
print(f" FAILED {id}: {e}")
|
||||
save_progress(progress)
|
||||
time.sleep(2)
|
||||
|
||||
save_progress(progress)
|
||||
elapsed = time.time() - start
|
||||
print(f"\nDone — {success} ok, {failed} failed, {elapsed/60:.1f} min")
|
||||
if limit and len(pending) > 0:
|
||||
est = (elapsed / len(pending)) * 12915 / 60
|
||||
print(f"Estimated full run: ~{est:.0f} min")
|
||||
|
||||
if __name__ == "__main__":
|
||||
migrate(int(sys.argv[1]) if len(sys.argv) > 1 else None)
|
||||
Reference in New Issue
Block a user