diff --git a/scripts/dream.py b/scripts/dream.py index ed8da8d..0c197b6 100644 --- a/scripts/dream.py +++ b/scripts/dream.py @@ -104,7 +104,7 @@ def get_recent_conversation_topics(days=14): # ─── Stage 2: Retrieve ────────────────────────────────────────────────────── -def retrieve(mode, task=None, n_results=8): +def retrieve(mode, task=None, n_results=8, excluded_sources=None): from sentence_transformers import SentenceTransformer embedder = SentenceTransformer("all-MiniLM-L6-v2") low, high = MODE_RANGES[mode] @@ -127,12 +127,22 @@ def retrieve(mode, task=None, n_results=8): try: pg = get_pg() cur = pg.cursor() - cur.execute(""" - SELECT document, source, 1 - (embedding <=> %s::vector) as similarity - FROM embeddings - ORDER BY embedding <=> %s::vector - LIMIT %s - """, (embedding, embedding, n_results * 3)) + excluded_sources = excluded_sources or set() + if excluded_sources: + cur.execute(""" + SELECT document, source, 1 - (embedding <=> %s::vector) as similarity + FROM embeddings + WHERE source NOT IN %s + ORDER BY embedding <=> %s::vector + LIMIT %s + """, (embedding, tuple(excluded_sources), embedding, n_results * 3)) + else: + cur.execute(""" + SELECT document, source, 1 - (embedding <=> %s::vector) as similarity + FROM embeddings + ORDER BY embedding <=> %s::vector + LIMIT %s + """, (embedding, embedding, n_results * 3)) for doc, source, similarity in cur.fetchall(): if not (low <= similarity <= high): @@ -379,12 +389,18 @@ def dream_pipeline(): """ print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}") + state = load_dreamer_state() + previously_retrieved = set(state.get("retrieved_sources", [])) + session_retrieved = set() + delta = observe_corpus() print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream") + print(f"Excluding {len(previously_retrieved)} previously retrieved sources") # ── Stage 1: NREM ────────────────────────────────────────────────────── print("\n[NREM] Retrieving...") - nrem_chunks = retrieve("nrem") + nrem_chunks = retrieve("nrem", excluded_sources=previously_retrieved | session_retrieved) + session_retrieved.update(c["source"] for c in nrem_chunks) if not nrem_chunks: print("[NREM] No suitable chunks — aborting pipeline") return None @@ -405,7 +421,8 @@ def dream_pipeline(): # ── Stage 2: Early REM — informed by NREM ────────────────────────────── print("\n[Early REM] Retrieving...") - early_chunks = retrieve("early-rem") + early_chunks = retrieve("early-rem", excluded_sources=previously_retrieved | session_retrieved) + session_retrieved.update(c["source"] for c in early_chunks) if not early_chunks: print("[Early REM] No suitable chunks — skipping") early_rem_output = nrem_output # fallback @@ -424,7 +441,8 @@ def dream_pipeline(): # ── Stage 3: Late REM — informed by NREM + Early REM ────────────────── print("\n[Late REM] Retrieving...") - late_chunks = retrieve("late-rem") + late_chunks = retrieve("late-rem", excluded_sources=previously_retrieved | session_retrieved) + session_retrieved.update(c["source"] for c in late_chunks) if not late_chunks: print("[Late REM] No suitable chunks — skipping") late_rem_output = early_rem_output # fallback @@ -468,6 +486,13 @@ def dream_pipeline(): state["last_dream_timestamp"] = datetime.now().timestamp() state["last_dream_mode"] = "pipeline" state["last_dream_file"] = synthesis_file + + # Accumulate retrieved sources across nights. Cap at 500, trim to 400 on overflow. + all_retrieved = list(previously_retrieved | session_retrieved) + if len(all_retrieved) > 500: + all_retrieved = all_retrieved[-400:] + state["retrieved_sources"] = all_retrieved + save_dreamer_state(state) notify_sse("synthesis", synthesis_file.split("/")[-1])