ingest_conversations.py: lazy-load embedder to match ingest.py pattern
Embedder was instantiated at module import (~30-60s, ~200MB) regardless of whether new conversations existed. On nights with no new content (most nights per the logs), the script paid the load cost and exited immediately. ingest.py:134 already uses lazy loading; this brings the two ingest scripts into a consistent shape.
This commit is contained in:
@@ -18,8 +18,14 @@ CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
|
||||
PG_DSN = os.getenv("PG_DSN")
|
||||
MIN_EXCHANGES = 3
|
||||
|
||||
print("Loading embedding model...")
|
||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
_embedder = None
|
||||
|
||||
def get_embedder():
|
||||
global _embedder
|
||||
if _embedder is None:
|
||||
print("Loading embedding model...")
|
||||
_embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
return _embedder
|
||||
|
||||
def get_conversations():
|
||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||
@@ -123,7 +129,7 @@ def run():
|
||||
|
||||
# Embed and insert
|
||||
texts = [c[1] for c in new_chunks]
|
||||
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
|
||||
embeddings = get_embedder().encode(texts, show_progress_bar=False).tolist()
|
||||
|
||||
for (chunk_id, chunk_text, meta), embedding in zip(new_chunks, embeddings):
|
||||
if not meta.get("type"):
|
||||
|
||||
Reference in New Issue
Block a user