From c5fc517fef23358f37454a96803c09a551cd9000 Mon Sep 17 00:00:00 2001
From: Aaron Nelson
Date: Mon, 4 May 2026 03:13:45 +0000
Subject: [PATCH] ingest_conversations.py: lazy-load embedder to match ingest.py pattern

Embedder was instantiated at module import (~30-60s, ~200MB) regardless
of whether new conversations existed. On nights with no new content
(most nights per the logs), the script paid the load cost and exited
immediately. ingest.py:134 already uses lazy loading; this brings the
two ingest scripts into a consistent shape.
---
 scripts/ingest_conversations.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/scripts/ingest_conversations.py b/scripts/ingest_conversations.py
index 3ad7100..694bd57 100644
--- a/scripts/ingest_conversations.py
+++ b/scripts/ingest_conversations.py
@@ -18,8 +18,14 @@ CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
 PG_DSN = os.getenv("PG_DSN")
 MIN_EXCHANGES = 3
 
-print("Loading embedding model...")
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
+_embedder = None
+
+def get_embedder():
+    global _embedder
+    if _embedder is None:
+        print("Loading embedding model...")
+        _embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    return _embedder
 
 def get_conversations():
     conn = sqlite3.connect(CONVERSATIONS_DB)
@@ -123,7 +129,7 @@ def run():
 
     # Embed and insert
     texts = [c[1] for c in new_chunks]
-    embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
+    embeddings = get_embedder().encode(texts, show_progress_bar=False).tolist()
 
     for (chunk_id, chunk_text, meta), embedding in zip(new_chunks, embeddings):
         if not meta.get("type"):