ingest_conversations.py: lazy-load embedder to match ingest.py pattern
The embedder was instantiated at module import time (~30-60 s, ~200 MB) regardless of whether any new conversations existed. On nights with no new content (most nights, per the logs), the script paid the full load cost and then exited immediately. ingest.py:134 already uses lazy loading; this change brings the two ingest scripts into a consistent shape.
This commit is contained in:
@@ -18,8 +18,14 @@ CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
|
|||||||
PG_DSN = os.getenv("PG_DSN")
|
PG_DSN = os.getenv("PG_DSN")
|
||||||
MIN_EXCHANGES = 3
|
MIN_EXCHANGES = 3
|
||||||
|
|
||||||
print("Loading embedding model...")
|
_embedder = None
|
||||||
embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
|
||||||
|
def get_embedder():
|
||||||
|
global _embedder
|
||||||
|
if _embedder is None:
|
||||||
|
print("Loading embedding model...")
|
||||||
|
_embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
||||||
|
return _embedder
|
||||||
|
|
||||||
def get_conversations():
|
def get_conversations():
|
||||||
conn = sqlite3.connect(CONVERSATIONS_DB)
|
conn = sqlite3.connect(CONVERSATIONS_DB)
|
||||||
@@ -123,7 +129,7 @@ def run():
|
|||||||
|
|
||||||
# Embed and insert
|
# Embed and insert
|
||||||
texts = [c[1] for c in new_chunks]
|
texts = [c[1] for c in new_chunks]
|
||||||
embeddings = embedder.encode(texts, show_progress_bar=False).tolist()
|
embeddings = get_embedder().encode(texts, show_progress_bar=False).tolist()
|
||||||
|
|
||||||
for (chunk_id, chunk_text, meta), embedding in zip(new_chunks, embeddings):
|
for (chunk_id, chunk_text, meta), embedding in zip(new_chunks, embeddings):
|
||||||
if not meta.get("type"):
|
if not meta.get("type"):
|
||||||
|
|||||||
Reference in New Issue
Block a user