From e5898f3019d000dd2854c5da01e0a0db48cdb28f Mon Sep 17 00:00:00 2001 From: Aaron Nelson Date: Sun, 3 May 2026 20:32:15 +0000 Subject: [PATCH] dream.py: replace cumulative cross-night exclusion with session-scoped novelty (Track 1 Finding 1) The cumulative `retrieved_sources` list (capped at 500, trimmed to 400 on overflow) was hiding ~40% of the corpus from Early REM and Late REM after the cap filled. The architecture and reframe both specify session-scoped novelty, not corpus-lifetime exclusion. Same NREM-shape divergence as the 2026-05-02 NREM exclusion fix. Changes: - Drop `previously_retrieved` load; pop the legacy `retrieved_sources` key from `dreamer_state.json` at pipeline start. - Early REM excludes only the current session's NREM high-scorers. - Late REM excludes only the current session's NREM ∪ Early REM. - Remove the across-night accumulation block at the end of the pipeline; reuse the in-scope state object for the post-pipeline metadata write (eliminates a redundant disk re-read that was reintroducing the legacy key). NREM exclusion fix from 2026-05-02 preserved (`nrem_chunks = retrieve("nrem", excluded_sources=None)`). Verification: post-fix dream-manifest source count rose to 24 (NREM 8 + Early REM 8 + Late REM 8) vs. 13 / 16 on the two prior comparable runs. Legacy key absent from `dreamer_state.json` post-run. 
--- scripts/dream.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/scripts/dream.py b/scripts/dream.py index 612b1ab..373a3f3 100644 --- a/scripts/dream.py +++ b/scripts/dream.py @@ -490,12 +490,12 @@ def dream_pipeline(): print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}") state = load_dreamer_state() - previously_retrieved = set(state.get("retrieved_sources", [])) + state.pop("retrieved_sources", None) # legacy key; session-scoped novelty now session_retrieved = set() delta = observe_corpus() print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream") - print(f"Excluding {len(previously_retrieved)} previously retrieved sources") + print("Novelty: session-scoped (no across-night exclusion)") # ── Stage 1: NREM ────────────────────────────────────────────────────── print("\n[NREM] Retrieving...") @@ -532,7 +532,7 @@ def dream_pipeline(): print("\n[Early REM] Retrieving...") # Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved) # Sources that scored in Early REM band during NREM remain available - early_chunks = retrieve("early-rem", excluded_sources=previously_retrieved | nrem_high_sources) + early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources) session_retrieved.update(c["source"] for c in early_chunks) if not early_chunks: print("[Early REM] No suitable chunks — skipping") @@ -557,7 +557,7 @@ def dream_pipeline(): # ── Stage 3: Late REM — informed by NREM + Early REM ────────────────── print("\n[Late REM] Retrieving...") - late_chunks = retrieve("late-rem", excluded_sources=previously_retrieved | session_retrieved) + late_chunks = retrieve("late-rem", excluded_sources=session_retrieved) session_retrieved.update(c["source"] for c in late_chunks) if not late_chunks: print("[Late REM] No suitable chunks — skipping") @@ -616,18 +616,11 @@ def dream_pipeline(): } 
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data) - # Update state and notify - state = load_dreamer_state() + # Update state and notify (reuse state from start of pipeline; legacy key already popped) state["last_dream_timestamp"] = datetime.now().timestamp() state["last_dream_mode"] = "pipeline" state["last_dream_file"] = synthesis_file - # Accumulate retrieved sources across nights. Cap at 500, trim to 400 on overflow. - all_retrieved = list(previously_retrieved | session_retrieved) - if len(all_retrieved) > 500: - all_retrieved = all_retrieved[-400:] - state["retrieved_sources"] = all_retrieved - save_dreamer_state(state) notify_sse("synthesis", synthesis_file.split("/")[-1])