dream.py: replace cumulative cross-night exclusion with session-scoped novelty (Track 1 Finding 1)
The cumulative `retrieved_sources` list (capped at 500, trimmed to 400 on
overflow) was hiding ~40% of the corpus from Early REM and Late REM after the
cap filled. The architecture and reframe both specify session-scoped novelty,
not corpus-lifetime exclusion. Same NREM-shape divergence as the 2026-05-02
NREM exclusion fix.
Changes:
- Drop `previously_retrieved` load; pop the legacy `retrieved_sources` key
from `dreamer_state.json` at pipeline start.
- Early REM excludes only the current session's NREM high-scorers.
- Late REM excludes only the current session's NREM ∪ Early REM.
- Remove the across-night accumulation block at the end of the pipeline; reuse
the in-scope state object for the post-pipeline metadata write (eliminates a
redundant disk re-read that was reintroducing the legacy key).
NREM exclusion fix from 2026-05-02 preserved (`nrem_chunks = retrieve("nrem",
excluded_sources=None)`).
Verification: post-fix dream-manifest source count rose to 24 (NREM 8 + Early
REM 8 + Late REM 8) vs. 13 / 16 on the two prior comparable runs. Legacy key
absent from `dreamer_state.json` post-run.
This commit is contained in:
+5
-12
@@ -490,12 +490,12 @@ def dream_pipeline():
|
|||||||
print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
||||||
|
|
||||||
state = load_dreamer_state()
|
state = load_dreamer_state()
|
||||||
previously_retrieved = set(state.get("retrieved_sources", []))
|
state.pop("retrieved_sources", None) # legacy key; session-scoped novelty now
|
||||||
session_retrieved = set()
|
session_retrieved = set()
|
||||||
|
|
||||||
delta = observe_corpus()
|
delta = observe_corpus()
|
||||||
print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
|
print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
|
||||||
print(f"Excluding {len(previously_retrieved)} previously retrieved sources")
|
print("Novelty: session-scoped (no across-night exclusion)")
|
||||||
|
|
||||||
# ── Stage 1: NREM ──────────────────────────────────────────────────────
|
# ── Stage 1: NREM ──────────────────────────────────────────────────────
|
||||||
print("\n[NREM] Retrieving...")
|
print("\n[NREM] Retrieving...")
|
||||||
@@ -532,7 +532,7 @@ def dream_pipeline():
|
|||||||
print("\n[Early REM] Retrieving...")
|
print("\n[Early REM] Retrieving...")
|
||||||
# Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
|
# Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
|
||||||
# Sources that scored in Early REM band during NREM remain available
|
# Sources that scored in Early REM band during NREM remain available
|
||||||
early_chunks = retrieve("early-rem", excluded_sources=previously_retrieved | nrem_high_sources)
|
early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources)
|
||||||
session_retrieved.update(c["source"] for c in early_chunks)
|
session_retrieved.update(c["source"] for c in early_chunks)
|
||||||
if not early_chunks:
|
if not early_chunks:
|
||||||
print("[Early REM] No suitable chunks — skipping")
|
print("[Early REM] No suitable chunks — skipping")
|
||||||
@@ -557,7 +557,7 @@ def dream_pipeline():
|
|||||||
|
|
||||||
# ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
|
# ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
|
||||||
print("\n[Late REM] Retrieving...")
|
print("\n[Late REM] Retrieving...")
|
||||||
late_chunks = retrieve("late-rem", excluded_sources=previously_retrieved | session_retrieved)
|
late_chunks = retrieve("late-rem", excluded_sources=session_retrieved)
|
||||||
session_retrieved.update(c["source"] for c in late_chunks)
|
session_retrieved.update(c["source"] for c in late_chunks)
|
||||||
if not late_chunks:
|
if not late_chunks:
|
||||||
print("[Late REM] No suitable chunks — skipping")
|
print("[Late REM] No suitable chunks — skipping")
|
||||||
@@ -616,18 +616,11 @@ def dream_pipeline():
|
|||||||
}
|
}
|
||||||
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
|
write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)
|
||||||
|
|
||||||
# Update state and notify
|
# Update state and notify (reuse state from start of pipeline; legacy key already popped)
|
||||||
state = load_dreamer_state()
|
|
||||||
state["last_dream_timestamp"] = datetime.now().timestamp()
|
state["last_dream_timestamp"] = datetime.now().timestamp()
|
||||||
state["last_dream_mode"] = "pipeline"
|
state["last_dream_mode"] = "pipeline"
|
||||||
state["last_dream_file"] = synthesis_file
|
state["last_dream_file"] = synthesis_file
|
||||||
|
|
||||||
# Accumulate retrieved sources across nights. Cap at 500, trim to 400 on overflow.
|
|
||||||
all_retrieved = list(previously_retrieved | session_retrieved)
|
|
||||||
if len(all_retrieved) > 500:
|
|
||||||
all_retrieved = all_retrieved[-400:]
|
|
||||||
state["retrieved_sources"] = all_retrieved
|
|
||||||
|
|
||||||
save_dreamer_state(state)
|
save_dreamer_state(state)
|
||||||
|
|
||||||
notify_sse("synthesis", synthesis_file.split("/")[-1])
|
notify_sse("synthesis", synthesis_file.split("/")[-1])
|
||||||
|
|||||||
Reference in New Issue
Block a user