dream.py: Stage 3+ refactor — LLM-generated queries, MMR, mutable windows, consolidation cursor
Implements the rest of dreamer-design-spec.md's Stage 3 alongside the prescriptions from the external literature review:
- Hardcoded seed query strings are gone. _llm_generate_queries() produces 4 mode-appropriate retrieval queries per call from the observation signal (Park et al. 2023 reflection pattern). NREM queries probe RECENT additions; Early REM bridges associative/emotional threads; Late REM forces cross-domain pairs; Lucid decomposes the task. Empirical first-run output: queries like "SUNY New Paltz Fall 2026 registration moratorium" instead of the fixed "research fabrication teaching practice recent work" — vector neighborhood now drifts with what the user has actually been doing.
- TIME_WINDOWS_HOURS makes per-mode retrieval windows mutable (dreamer-multimodal-design.md §2's tech-debt item): NREM 72hr / Early REM 30d / Late REM 90d / Lucid no-window. NULL created_at rows are excluded from windowed modes — correct since they predate the cursor by definition.
- NREM bias toward under-processed chunks via "ORDER BY consolidation_count ASC" before vector distance. Biologically motivated: sharp-wave-ripple replay is tagged/biased, not uniform. Chunks that haven't been replayed recently win the tiebreak.
- MMR merge (Carbonell & Goldstein 1998) over the union of all queries' candidates. λ=0.5. Directly attacks the cluster-dominance failure mode where 8 dossier-narrative variants filled all 8 slots in 5 consecutive nights.
- _bump_consolidation_cursor() called after NREM completes. Each source used gets consolidation_count += 1 and last_consolidated_at = NOW(). Tomorrow's signal sees these as more-processed, less under-processed.
- dream_pipeline now runs observe_corpus + select_mode at the top per spec lines 27-34. If select_mode returns None — corpus unchanged + no new journal entry — pipeline exits with no dream rather than manufacturing novelty (spec line 67's "dreamer goes quiet").
Back-compat preserved:
- retrieve()'s signature gains `signal` as an optional kwarg; default behavior calls observe_corpus() inline so dream_single / dream_lucid keep working unchanged.
- Graphiti substrate (E3 experiment) path untouched.
- Manifest schema keeps the "query" field; value is now "[llm-generated from observation signal]" so historical manifest consumers don't break.
This commit is contained in:
+327
-49
@@ -23,6 +23,7 @@ from datetime import datetime, timedelta
|
|||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import psycopg2
|
import psycopg2
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
load_dotenv(Path.home() / "aaronai" / ".env", override=True)
|
||||||
|
|
||||||
@@ -42,6 +43,26 @@ NEXTCLOUD_USER = os.getenv("NEXTCLOUD_USER", "aaron")
|
|||||||
NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "")
|
NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "")
|
||||||
DREAMS_WEBDAV = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams"
|
DREAMS_WEBDAV = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams"
|
||||||
|
|
||||||
|
# ─── Retrieval-window config (per dreamer-multimodal-design.md §2) ─────────
# Biological grounding: NREM replays recent traces (24-72 hrs); REM links
# across time on structural similarity, not temporal proximity. Synthesis
# pulls from salience across the full corpus (no window). Spec calls for
# these to be mutable rather than hardcoded — this is the mutable home.
#
# Keys are retrieval mode names; values are the look-back window in HOURS.
# retrieve() reads this with TIME_WINDOWS_HOURS.get(mode) and only adds a
# created_at filter when the value is not None, so None (or a mode missing
# from this dict) means "search the whole corpus".
TIME_WINDOWS_HOURS = {
    "nrem": 72,  # 24-72 hrs, take wider end
    "early-rem": 24 * 30,  # 30 days
    "late-rem": 24 * 90,  # 90 days
    "lucid": None,  # no window
}

# Maximal Marginal Relevance: λ=1 → pure relevance, λ=0 → pure diversity.
# 0.5 is the standard balance; tune later if the dossier-cluster problem
# isn't sufficiently broken up.
# Used as the default for _mmr_select()'s lambda_ parameter.
MMR_LAMBDA = 0.5

# Fast/cheap model for query generation. Sonnet for synthesis (in synthesize_*).
# Overridable at deploy time through the DREAMER_QUERY_MODEL environment variable.
LLM_QUERY_MODEL = os.getenv("DREAMER_QUERY_MODEL", "claude-haiku-4-5-20251001")
|
||||||
|
|
||||||
# Similarity ranges calibrated for all-MiniLM-L6-v2
|
# Similarity ranges calibrated for all-MiniLM-L6-v2
|
||||||
MODE_RANGES = {
|
MODE_RANGES = {
|
||||||
"nrem": (0.48, 0.72),
|
"nrem": (0.48, 0.72),
|
||||||
@@ -289,36 +310,207 @@ def _get_embedder():
|
|||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
return SentenceTransformer("all-MiniLM-L6-v2")
|
return SentenceTransformer("all-MiniLM-L6-v2")
|
||||||
|
|
||||||
def retrieve(mode, task=None, n_results=8, excluded_sources=None, type_filter=None):
|
def _llm_generate_queries(mode, signal, task=None, n_queries=4):
    """Generate retrieval queries for one dream mode via the LLM.

    Park et al. 2023 reflection-style query generation. Feeds the LLM the
    observation signal + a mode-specific framing; emits N retrieval queries
    that probe different corners of the recent corpus instead of the same
    hardcoded string every night. Sources cited in dream_observation.py.

    Args:
        mode: Retrieval mode name ("nrem", "early-rem", "late-rem", ...).
            Modes without their own framing reuse the NREM framing.
        signal: Observation-signal dict. The keys read here are
            "recent_questions", "new_journal_entries", "days_since_dream",
            "new_chunks" and "underprocessed_count"; all are optional.
            None is treated as an empty signal.
        task: Optional user task. When given, the LLM is asked to decompose
            the task into sub-questions and the signal is not used for the
            prompt.
        n_queries: Maximum number of queries to return.

    Returns:
        A non-empty list of query strings. If the LLM call fails for any
        reason (SDK not installed, API/network error, unparseable or empty
        reply), falls back to the signal's recent questions, then to the
        task itself, then to a generic seed string.
    """
    # Callers may pass None (the fallback below always allowed for it); make
    # prompt construction tolerate it too instead of raising AttributeError.
    signal = signal or {}
    recent_questions = signal.get("recent_questions") or []

    if task:
        # Lucid mode: decompose the user's task into sub-queries
        prompt = (
            f"Decompose this user task into {n_queries} distinct sub-questions, "
            "each suitable as a retrieval query against Aaron's personal corpus.\n\n"
            f"TASK: {task}\n\n"
            f'Output JSON ONLY: {{"queries": ["...", "...", ...]}}'
        )
    else:
        mode_framings = {
            "nrem": (
                "NREM is replay-and-consolidation of RECENT traces. Generate queries "
                "that probe what Aaron has been working on or capturing in the last "
                "few days. Concrete entities — project names, course codes, named "
                "subjects. The dreamer is re-touching specific recent material to "
                "strengthen schema connections, not finding novel content."
            ),
            "early-rem": (
                "Early REM is associative bridging with emotional/personal register. "
                "Generate queries that surface unresolved themes, career questions, "
                "ongoing personal threads — material that connects intellectual and "
                "emotional dimensions. Tone: thoughtful friend, not researcher."
            ),
            "late-rem": (
                "Late REM tests novel connections across DISTANT material. Generate "
                "queries that pair concrete subjects from DIFFERENT domains of Aaron's "
                "work (e.g., one from academic teaching, one from consulting, one from "
                "creative practice) to probe for surprising structural similarity. "
                "Cross-domain is required."
            ),
        }
        framing = mode_framings.get(mode, mode_framings["nrem"])
        questions_snippet = "\n".join(
            f" - {q[:200]}" for q in recent_questions[:8]
        ) or " (no recent user questions)"
        journal_snippet = ", ".join(signal.get("new_journal_entries", [])[:5]) or "(none)"
        days = signal.get("days_since_dream")
        # None / inf both mean "no previous dream on record".
        days_str = (
            f"{days:.1f}"
            if days not in (None, float("inf"))
            else "infinite (first dream)"
        )
        prompt = (
            "You generate retrieval queries for an Active Inference dreamer. The "
            "dreamer surfaces prediction errors — gaps between Aaron's model and "
            "reality — not summaries or generic associations.\n\n"
            f"MODE: {mode}\n"
            f"FRAMING: {framing}\n\n"
            "OBSERVATION SIGNAL:\n"
            f"- Days since last dream: {days_str}\n"
            f"- New chunks since last dream: {signal.get('new_chunks', 0)}\n"
            f"- New journal entries: {journal_snippet}\n"
            f"- Underprocessed chunks pool: {signal.get('underprocessed_count', 0):,}\n\n"
            f"RECENT USER QUESTIONS (last 14 days, top 8):\n{questions_snippet}\n\n"
            f"Generate {n_queries} retrieval queries. Requirements:\n"
            "- Use concrete entities, named projects, course codes, specific topics "
            "— NOT generic phrasing like 'research work practice'\n"
            "- Each query probes a DIFFERENT corner of recent activity\n"
            f"- Match the {mode} framing\n"
            "- 5-15 words each\n\n"
            f'Output JSON ONLY: {{"queries": ["...", "...", ...]}}'
        )

    try:
        # Imported inside the try so that a missing SDK degrades to the
        # fallback path like any other LLM failure, instead of raising.
        import anthropic

        client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
        resp = client.messages.create(
            model=LLM_QUERY_MODEL,
            max_tokens=512,
            messages=[{"role": "user", "content": prompt}],
        )
        text = "".join(b.text for b in resp.content if hasattr(b, "text")).strip()
        # Models sometimes wrap the JSON in a ```json ... ``` fence; unwrap it.
        if text.startswith("```"):
            text = text.split("```", 2)[1]
            if text.startswith("json"):
                text = text[4:]
            text = text.strip()
        data = json.loads(text)
        raw_queries = data.get("queries", [])
        if isinstance(raw_queries, list):
            # Drop blanks BEFORE truncating so empty entries don't eat slots.
            cleaned = [s for s in (str(q).strip() for q in raw_queries) if s]
            if cleaned:
                return cleaned[:n_queries]
        print("[dream] LLM returned no usable queries; falling back to recent questions")
    except Exception as e:
        # Deliberate best-effort boundary: query generation must never take
        # the pipeline down, so every failure mode routes to the fallback.
        print(f"[dream] LLM query generation failed ({e}); falling back to recent questions")

    fallback = [s for s in (str(q).strip() for q in recent_questions) if s][:n_queries]
    return fallback or [task or "recent activity decisions thinking"]
|
||||||
|
|
||||||
|
|
||||||
|
def _mmr_select(candidate_embeddings, query_embedding, n, lambda_=MMR_LAMBDA):
    """Pick up to ``n`` candidates by greedy Maximal Marginal Relevance.

    Carbonell & Goldstein 1998. Each step takes the candidate with the best
    trade-off between cosine relevance to the query and cosine similarity to
    anything already chosen, which keeps one tight cluster (e.g., 8
    dossier-narrative variants) from filling every slot.

    candidate_embeddings: (N, D) numpy array
    query_embedding: (D,) numpy array
    lambda_: weight on relevance; (1 - lambda_) weights the redundancy penalty.
    Returns: list of indices into candidate_embeddings, len ≤ n."""
    total = len(candidate_embeddings)
    if total == 0:
        return []
    budget = min(n, total)

    # Unit-normalise so dot products are cosine similarities; the epsilon
    # guards against division by zero for all-zero vectors.
    unit_cands = candidate_embeddings / (
        np.linalg.norm(candidate_embeddings, axis=1, keepdims=True) + 1e-9
    )
    unit_query = query_embedding / (np.linalg.norm(query_embedding) + 1e-9)
    relevance = unit_cands @ unit_query

    picked = []
    pool = list(range(total))

    def _marginal_score(idx):
        # Redundancy = closest match among the candidates picked so far.
        redundancy = float((unit_cands[idx] @ unit_cands[picked].T).max())
        return lambda_ * relevance[idx] - (1 - lambda_) * redundancy

    while pool and len(picked) < budget:
        if picked:
            winner = max(pool, key=_marginal_score)
        else:
            # First pick has nothing to be redundant with: pure relevance.
            winner = max(pool, key=lambda idx: relevance[idx])
        picked.append(winner)
        pool.remove(winner)
    return picked
|
||||||
|
|
||||||
|
|
||||||
|
def _bump_consolidation_cursor(chunks):
    """Increment consolidation_count + set last_consolidated_at=NOW() for each
    source represented in chunks. Called from dream_pipeline after NREM
    completes. Per sharp-wave-ripples biology, NREM does the actual
    consolidation; REM is associative use, so we only bump on NREM.

    Args:
        chunks: Iterable of chunk dicts; only the "source" key is read.
            Chunks with a missing/empty source are ignored.

    Returns:
        None. Failures are logged and swallowed on purpose: the bump is
        bookkeeping and must not abort a dream that already retrieved its
        chunks.
    """
    if not chunks:
        return
    sources = list({c["source"] for c in chunks if c.get("source")})
    if not sources:
        return

    pg = None
    try:
        pg = get_pg()
        cur = pg.cursor()
        # One statement for all sources; every embeddings row whose source is
        # in the list is bumped.
        cur.execute(
            "UPDATE embeddings "
            "SET consolidation_count = consolidation_count + 1, "
            " last_consolidated_at = NOW() "
            "WHERE source = ANY(%s)",
            (sources,),
        )
        pg.commit()
    except Exception as e:
        print(f"[dream] cursor bump failed (non-fatal): {e}")
    finally:
        # Close on every path: previously a failed execute/commit skipped
        # close() and leaked the connection.
        if pg is not None:
            try:
                pg.close()
            except Exception as close_err:
                print(f"[dream] cursor bump connection close failed (non-fatal): {close_err}")
|
||||||
|
|
||||||
|
|
||||||
|
def retrieve(mode, task=None, n_results=8, excluded_sources=None,
|
||||||
|
type_filter=None, signal=None):
|
||||||
|
"""Refactored retrieval — see dreamer-design-spec.md Stage 3 + the
|
||||||
|
external-literature prescription in birdai-dreamer-exclusion-finding-2026-05-02.md.
|
||||||
|
|
||||||
|
Changes from the prior hardcoded-query version:
|
||||||
|
- Queries are LLM-generated from the observation signal (Park et al.
|
||||||
|
reflection pattern) instead of fixed strings. Solves the "same 8 sources
|
||||||
|
every night" failure where fixed seeds locked into one neighborhood.
|
||||||
|
- Per-mode time windows (24-72hr NREM / 30d Early REM / 90d Late REM)
|
||||||
|
filter candidates before vector search. Spec calls for these to be
|
||||||
|
mutable; they live in TIME_WINDOWS_HOURS.
|
||||||
|
- NREM biases toward under-processed chunks (low consolidation_count).
|
||||||
|
Biologically motivated: sharp-wave ripples tag what to replay, not
|
||||||
|
uniform sampling.
|
||||||
|
- Multiple queries (4 by default) → over-fetch → MMR merge for
|
||||||
|
within-night diversity. Prevents cluster domination.
|
||||||
|
|
||||||
|
signal is the observation-signal dict from dream_observation.observe_corpus().
|
||||||
|
If None, observe_corpus is called inline (back-compat for ad-hoc invocation).
|
||||||
|
"""
|
||||||
|
# E3 substrate experiment unchanged
|
||||||
|
substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
|
||||||
|
if substrate == "graphiti":
|
||||||
|
return retrieve_graphiti(mode, task=task, n_results=n_results,
|
||||||
|
excluded_sources=excluded_sources)
|
||||||
|
|
||||||
|
if signal is None:
|
||||||
|
from dream_observation import observe_corpus as _obs
|
||||||
|
signal = _obs()
|
||||||
|
|
||||||
|
queries = _llm_generate_queries(mode, signal, task=task, n_queries=4)
|
||||||
|
if not queries:
|
||||||
|
print(f"[dream:{mode}] no queries generated; bailing")
|
||||||
|
return []
|
||||||
|
print(f"[dream:{mode}] generated queries: {queries}")
|
||||||
|
|
||||||
|
embedder = _get_embedder()
|
||||||
excluded_sources = excluded_sources or set()
|
excluded_sources = excluded_sources or set()
|
||||||
|
window_hours = TIME_WINDOWS_HOURS.get(mode)
|
||||||
|
per_query_n = 12 # over-fetch for MMR
|
||||||
|
|
||||||
|
candidates = []
|
||||||
|
seen_ids = set()
|
||||||
|
try:
|
||||||
|
pg = get_pg()
|
||||||
|
cur = pg.cursor()
|
||||||
|
for q in queries:
|
||||||
|
q_emb = embedder.encode([q]).tolist()[0]
|
||||||
where, params = [], []
|
where, params = [], []
|
||||||
if excluded_sources:
|
if excluded_sources:
|
||||||
where.append("source NOT IN %s")
|
where.append("source NOT IN %s")
|
||||||
@@ -326,33 +518,85 @@ def retrieve(mode, task=None, n_results=8, excluded_sources=None, type_filter=No
|
|||||||
if type_filter:
|
if type_filter:
|
||||||
where.append("type = ANY(%s)")
|
where.append("type = ANY(%s)")
|
||||||
params.append(list(type_filter))
|
params.append(list(type_filter))
|
||||||
|
if window_hours is not None:
|
||||||
|
# created_at is TEXT (legacy); cast it. NULL created_at fails
|
||||||
|
# the comparison so legacy rows are excluded from windowed
|
||||||
|
# modes — correct: NULL means "indexed before cursor existed,"
|
||||||
|
# which by definition is older than any window.
|
||||||
|
where.append(
|
||||||
|
f"(created_at IS NOT NULL AND "
|
||||||
|
f"created_at::timestamptz > NOW() - INTERVAL '{int(window_hours)} hours')"
|
||||||
|
)
|
||||||
where_clause = ("WHERE " + " AND ".join(where)) if where else ""
|
where_clause = ("WHERE " + " AND ".join(where)) if where else ""
|
||||||
|
# NREM bias: order by consolidation_count ASC first (under-processed
|
||||||
|
# chunks win the tiebreak before vector distance). Other modes:
|
||||||
|
# vector distance only.
|
||||||
|
order_clause = (
|
||||||
|
"ORDER BY consolidation_count ASC, embedding <=> %s::vector"
|
||||||
|
if mode == "nrem"
|
||||||
|
else "ORDER BY embedding <=> %s::vector"
|
||||||
|
)
|
||||||
cur.execute(f"""
|
cur.execute(f"""
|
||||||
SELECT document, source, type, 1 - (embedding <=> %s::vector) as similarity
|
SELECT id, document, source, type, embedding,
|
||||||
|
1 - (embedding <=> %s::vector) as similarity
|
||||||
FROM embeddings
|
FROM embeddings
|
||||||
{where_clause}
|
{where_clause}
|
||||||
ORDER BY embedding <=> %s::vector
|
{order_clause}
|
||||||
LIMIT %s
|
LIMIT %s
|
||||||
""", [embedding, *params, embedding, n_results * 3])
|
""", [q_emb, *params, q_emb, per_query_n])
|
||||||
|
for row in cur.fetchall():
|
||||||
for doc, source, etype, similarity in cur.fetchall():
|
if row[0] in seen_ids:
|
||||||
if not (low <= similarity <= high):
|
|
||||||
continue
|
continue
|
||||||
if source in seen_sources:
|
seen_ids.add(row[0])
|
||||||
continue
|
emb = row[4]
|
||||||
chunks.append({
|
# pgvector returns embeddings as string "[...]" by default
|
||||||
"source": source or "unknown",
|
if isinstance(emb, str):
|
||||||
"content": doc,
|
emb = np.array([float(x) for x in emb.strip("[]").split(",")])
|
||||||
"relevance": similarity,
|
else:
|
||||||
"similarity": similarity,
|
emb = np.array(emb)
|
||||||
"type": etype,
|
candidates.append({
|
||||||
|
"id": row[0],
|
||||||
|
"content": row[1],
|
||||||
|
"source": row[2] or "unknown",
|
||||||
|
"type": row[3],
|
||||||
|
"embedding": emb,
|
||||||
|
"similarity": float(row[5]),
|
||||||
})
|
})
|
||||||
seen_sources.add(source)
|
|
||||||
if len(chunks) >= n_results:
|
|
||||||
break
|
|
||||||
pg.close()
|
pg.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"pgvector retrieval error: {e}")
|
import traceback
|
||||||
|
print(f"[dream:{mode}] retrieval SQL error: {e}")
|
||||||
|
traceback.print_exc()
|
||||||
|
return []
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
|
print(f"[dream:{mode}] zero candidates after filters")
|
||||||
|
return []
|
||||||
|
|
||||||
|
# MMR over the union, using the first query as pivot for the relevance term.
|
||||||
|
# Averaging query embeddings would be theoretically cleaner but adds
|
||||||
|
# complexity for marginal benefit at this scale.
|
||||||
|
pivot_emb = np.array(embedder.encode([queries[0]]).tolist()[0])
|
||||||
|
cand_embs = np.array([c["embedding"] for c in candidates])
|
||||||
|
selected_idx = _mmr_select(cand_embs, pivot_emb, n=n_results * 2)
|
||||||
|
|
||||||
|
# Post-MMR source-level dedup (multi-chunk same source collapses to one).
|
||||||
|
chunks = []
|
||||||
|
seen_sources = set()
|
||||||
|
for i in selected_idx:
|
||||||
|
c = candidates[i]
|
||||||
|
if c["source"] in seen_sources:
|
||||||
|
continue
|
||||||
|
seen_sources.add(c["source"])
|
||||||
|
chunks.append({
|
||||||
|
"source": c["source"],
|
||||||
|
"content": c["content"],
|
||||||
|
"relevance": c["similarity"],
|
||||||
|
"similarity": c["similarity"],
|
||||||
|
"type": c["type"],
|
||||||
|
})
|
||||||
|
if len(chunks) >= n_results:
|
||||||
|
break
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
@@ -496,6 +740,12 @@ def dream_pipeline(type_filter=None):
|
|||||||
"""
|
"""
|
||||||
Full nightly pipeline — interdependent stages.
|
Full nightly pipeline — interdependent stages.
|
||||||
NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis.
|
NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis.
|
||||||
|
|
||||||
|
Per dreamer-design-spec.md, this now runs Stage 1 (observe) and Stage 2
|
||||||
|
(select) first. If select_mode returns None — corpus unchanged and no new
|
||||||
|
journal entry — the dreamer goes quiet rather than manufacturing novelty.
|
||||||
|
Otherwise NREM/Early-REM/Late-REM run with LLM-generated queries seeded
|
||||||
|
from the observation signal.
|
||||||
"""
|
"""
|
||||||
print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
||||||
|
|
||||||
@@ -503,21 +753,47 @@ def dream_pipeline(type_filter=None):
|
|||||||
state.pop("retrieved_sources", None) # legacy key; session-scoped novelty now
|
state.pop("retrieved_sources", None) # legacy key; session-scoped novelty now
|
||||||
session_retrieved = set()
|
session_retrieved = set()
|
||||||
|
|
||||||
delta = observe_corpus()
|
# ── Stage 1 + 2: Observe + Select ──────────────────────────────────────
|
||||||
print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
|
from dream_observation import observe_corpus as _obs, select_mode as _select
|
||||||
print("Novelty: session-scoped (no across-night exclusion)")
|
signal = _obs()
|
||||||
|
print(
|
||||||
|
f"Signal: new_chunks={signal['new_chunks']}, "
|
||||||
|
f"new_journal={len(signal['new_journal_entries'])}, "
|
||||||
|
f"days_since={signal['days_since_dream']:.1f}, "
|
||||||
|
f"underprocessed={signal['underprocessed_count']:,}"
|
||||||
|
)
|
||||||
|
selected = _select(signal)
|
||||||
|
if selected is None:
|
||||||
|
print("[select_mode] None — nothing worth dreaming about tonight (going quiet)")
|
||||||
|
# Update last-dream-attempted-at but not last_dream — caller can distinguish
|
||||||
|
# an actual dream from a skipped night by looking at last_dream_file or
|
||||||
|
# checking the manifest dir.
|
||||||
|
state["last_select_quiet_at"] = datetime.now().isoformat()
|
||||||
|
save_dreamer_state(state)
|
||||||
|
return None
|
||||||
|
print(f"[select_mode] → {selected}")
|
||||||
|
|
||||||
# ── Stage 1: NREM ──────────────────────────────────────────────────────
|
# The pipeline always runs all three modes for the manifest's continuity.
|
||||||
|
# select_mode's choice signals the *primary* focus; the others still run
|
||||||
|
# but draw from their own mode-appropriate windows.
|
||||||
|
primary_mode = selected
|
||||||
|
|
||||||
|
# ── Stage 3: NREM ──────────────────────────────────────────────────────
|
||||||
print("\n[NREM] Retrieving...")
|
print("\n[NREM] Retrieving...")
|
||||||
# NREM is replay-and-consolidation — does not exclude prior traces.
|
# NREM is replay-and-consolidation — does not exclude prior traces.
|
||||||
# Late REM and Early REM exclude prior content for novelty; NREM does not.
|
# Late REM and Early REM exclude prior content for novelty; NREM does not.
|
||||||
nrem_chunks = retrieve("nrem", excluded_sources=None, type_filter=type_filter)
|
nrem_chunks = retrieve("nrem", excluded_sources=None,
|
||||||
|
type_filter=type_filter, signal=signal)
|
||||||
session_retrieved.update(c["source"] for c in nrem_chunks)
|
session_retrieved.update(c["source"] for c in nrem_chunks)
|
||||||
# Track sources that scored above Early REM ceiling — these are the only ones Early REM should exclude
|
# Track sources that scored above Early REM ceiling — these are the only ones Early REM should exclude
|
||||||
nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > 0.55}
|
nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > 0.55}
|
||||||
if not nrem_chunks:
|
if not nrem_chunks:
|
||||||
print("[NREM] No suitable chunks — aborting pipeline")
|
print("[NREM] No suitable chunks — aborting pipeline")
|
||||||
return None
|
return None
|
||||||
|
# Cursor bump: NREM is the consolidation stage. Each appearance increments
|
||||||
|
# consolidation_count + updates last_consolidated_at, so the next dream's
|
||||||
|
# observation sees these sources as less under-processed.
|
||||||
|
_bump_consolidation_cursor(nrem_chunks)
|
||||||
|
|
||||||
print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
|
print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
|
||||||
nrem_output = synthesize_nrem(nrem_chunks)
|
nrem_output = synthesize_nrem(nrem_chunks)
|
||||||
@@ -528,7 +804,7 @@ def dream_pipeline(type_filter=None):
|
|||||||
"nrem": {
|
"nrem": {
|
||||||
"chunks_retrieved": len(nrem_chunks),
|
"chunks_retrieved": len(nrem_chunks),
|
||||||
"avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
|
"avg_similarity": round(sum(c["relevance"] for c in nrem_chunks) / len(nrem_chunks), 3),
|
||||||
"query": "research fabrication teaching practice recent work",
|
"query": "[llm-generated from observation signal]",
|
||||||
"word_count": len(nrem_output.split()),
|
"word_count": len(nrem_output.split()),
|
||||||
"sources": nrem_sources,
|
"sources": nrem_sources,
|
||||||
"distinct_folders": nrem_folders,
|
"distinct_folders": nrem_folders,
|
||||||
@@ -546,7 +822,8 @@ def dream_pipeline(type_filter=None):
|
|||||||
print("\n[Early REM] Retrieving...")
|
print("\n[Early REM] Retrieving...")
|
||||||
# Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
|
# Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
|
||||||
# Sources that scored in Early REM band during NREM remain available
|
# Sources that scored in Early REM band during NREM remain available
|
||||||
early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources, type_filter=type_filter)
|
early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources,
|
||||||
|
type_filter=type_filter, signal=signal)
|
||||||
session_retrieved.update(c["source"] for c in early_chunks)
|
session_retrieved.update(c["source"] for c in early_chunks)
|
||||||
if not early_chunks:
|
if not early_chunks:
|
||||||
print("[Early REM] No suitable chunks — skipping")
|
print("[Early REM] No suitable chunks — skipping")
|
||||||
@@ -560,7 +837,7 @@ def dream_pipeline(type_filter=None):
|
|||||||
stage_data["early_rem"] = {
|
stage_data["early_rem"] = {
|
||||||
"chunks_retrieved": len(early_chunks),
|
"chunks_retrieved": len(early_chunks),
|
||||||
"avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
|
"avg_similarity": round(sum(c["relevance"] for c in early_chunks) / len(early_chunks), 3),
|
||||||
"query": "career decision personal change what matters next",
|
"query": "[llm-generated from observation signal]",
|
||||||
"word_count": len(early_rem_output.split()),
|
"word_count": len(early_rem_output.split()),
|
||||||
"sources": early_sources,
|
"sources": early_sources,
|
||||||
"distinct_folders": early_folders,
|
"distinct_folders": early_folders,
|
||||||
@@ -572,7 +849,8 @@ def dream_pipeline(type_filter=None):
|
|||||||
|
|
||||||
# ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
|
# ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
|
||||||
print("\n[Late REM] Retrieving...")
|
print("\n[Late REM] Retrieving...")
|
||||||
late_chunks = retrieve("late-rem", excluded_sources=session_retrieved, type_filter=type_filter)
|
late_chunks = retrieve("late-rem", excluded_sources=session_retrieved,
|
||||||
|
type_filter=type_filter, signal=signal)
|
||||||
session_retrieved.update(c["source"] for c in late_chunks)
|
session_retrieved.update(c["source"] for c in late_chunks)
|
||||||
if not late_chunks:
|
if not late_chunks:
|
||||||
print("[Late REM] No suitable chunks — skipping")
|
print("[Late REM] No suitable chunks — skipping")
|
||||||
@@ -591,7 +869,7 @@ def dream_pipeline(type_filter=None):
|
|||||||
stage_data["late_rem"] = {
|
stage_data["late_rem"] = {
|
||||||
"chunks_retrieved": len(late_chunks),
|
"chunks_retrieved": len(late_chunks),
|
||||||
"avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
|
"avg_similarity": round(sum(c["relevance"] for c in late_chunks) / len(late_chunks), 3),
|
||||||
"query": "practice place memory making",
|
"query": "[llm-generated from observation signal]",
|
||||||
"word_count": len(late_rem_output.split()),
|
"word_count": len(late_rem_output.split()),
|
||||||
"sources": late_sources,
|
"sources": late_sources,
|
||||||
"distinct_folders": list(set(late_folders)),
|
"distinct_folders": list(set(late_folders)),
|
||||||
|
|||||||
Reference in New Issue
Block a user