d985f9e91e
Two correctness bugs in dream_pipeline manifest assembly. write_manifest at lines 487-491 swallowed HTTP 4xx/5xx responses silently. requests.put() only raises on transport-level errors (DNS, connection refused, timeout); 401/403/500/507 come back as Response objects and never trigger the except. The code printed "Manifest written" while the manifest never persisted. The same file's deliver() function at line 434 already used response.raise_for_status() — the pattern was already established, write_manifest just skipped it. Fix: bind the response and call raise_for_status() before the success print. The except message changes from "(non-critical)" to "manifest not persisted" because HTTP failure now means manifest data was lost, which is critical, not quiet. corpus_data["total_chunks"] at lines 621-622 stored delta["new_chunks"], duplicating the sibling field new_chunks_since_last_dream. The field name claimed absolute corpus size; the value was a delta of recently-touched files. Verified in live manifests: total_chunks: 0 while pgvector held 11,379+ document embeddings. Fix: query SELECT COUNT(*) FROM embeddings inside dream_pipeline, store as total_chunks. Tightly-scoped one-shot connect via the existing get_pg() helper. Telemetry query failure is treated as non-critical and falls back to 0 — pgvector hiccup should not crash an otherwise successful dream pipeline. Bonus finding (not fixed in this commit): new_chunks_since_last_dream is itself misnamed. observe_corpus() reads the watcher's mtime cache and counts files (not chunks) whose mtime is newer than last_dream. Both fields were "files touched since last dream" duplicated under two different names; this commit fixes only the total_chunks semantics. Renaming new_chunks_since_last_dream is out of scope — manifests are write-only telemetry today, no consumer reads either field, and the rename is a separate decision. 
Verification: a real pipeline run produced a manifest whose total_chunks matched the result of running SELECT COUNT(*) directly; the same run doubled as a smoke test for the embedder cache (a single "Loading weights" line), type_distribution propagation, and the manifest write success path.
743 lines
30 KiB
Python
743 lines
30 KiB
Python
"""
|
|
Aaron AI Dreamer — Active Inference Engine
|
|
Interdependent stage architecture grounded in sleep consolidation research.
|
|
|
|
Nightly pipeline: NREM → Early REM → Late REM → Synthesis
|
|
Each stage receives the previous stage's output as context.
|
|
Lucid mode is on-demand only (Dream Now from settings).
|
|
|
|
Research basis:
|
|
- Singh et al. PNAS 2022: alternating NREM/REM outperforms single-stage approaches
|
|
- Klinzing et al. Nature Neuroscience 2019: SO-spindle-ripple coupling is interdependent
|
|
- REM operates on what NREM produced — stages are not discrete alternatives
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import sqlite3
|
|
import argparse
|
|
from functools import lru_cache
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from dotenv import load_dotenv
|
|
import psycopg2
|
|
import hashlib
|
|
|
|
# Load ~/aaronai/.env before any os.getenv() lookup below. override=True is
# passed so values from the file replace variables already set in the process.
_ENV_FILE = Path.home() / "aaronai" / ".env"
load_dotenv(_ENV_FILE, override=True)

# Connection string handed to psycopg2.connect() by get_pg(); None when unset.
PG_DSN = os.environ.get("PG_DSN")
|
|
|
|
def get_pg():
    """Open and return a new psycopg2 connection built from ``PG_DSN``.

    A fresh connection is created on every call; the caller is expected to
    close it when done.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection
|
|
|
|
# ─── Paths ──────────────────────────────────────────────────────────────────
|
|
_AARONAI_HOME = Path.home() / "aaronai"

# Local state files kept under ~/aaronai.
CONVERSATIONS_DB = str(_AARONAI_HOME / "conversations.db")
WATCHER_STATE = str(_AARONAI_HOME / "watcher_state.json")
DREAMER_STATE = str(_AARONAI_HOME / "dreamer_state.json")
JOURNAL_DIR = "/home/aaron/nextcloud/data/data/aaron/files/Journal/Daily"

# Nextcloud connection settings; each one can be overridden via the environment.
NEXTCLOUD_URL = os.getenv("NEXTCLOUD_URL", "https://nextcloud.aaronnelson.studio")
NEXTCLOUD_USER = os.getenv("NEXTCLOUD_USER", "aaron")
NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "")

# WebDAV collection that receives dream files and manifests.
DREAMS_WEBDAV = (
    NEXTCLOUD_URL + "/remote.php/dav/files/" + NEXTCLOUD_USER + "/Journal/Dreams"
)
|
|
|
|
# Similarity ranges calibrated for all-MiniLM-L6-v2
|
|
# Per-mode (low, high) similarity band. retrieve() keeps only rows whose
# similarity to the stage query falls inside the band for the requested mode.
MODE_RANGES = {
    "nrem": (0.48, 0.72),
    "early-rem": (0.38, 0.55),
    "late-rem": (0.22, 0.42),
    "lucid": (0.32, 0.72),
}

# 1.0 = original exclusion logic; 1.1 = score-band exclusion.
DREAMER_VERSION = "1.1"
|
|
|
|
# ─── Prompt versioning ──────────────────────────────────────────────────────
|
|
# Bump the relevant constant manually when changing a prompt.
|
|
PROMPT_VERSION_NREM = "1.0"
PROMPT_VERSION_EREM = "1.1"
PROMPT_VERSION_LREM = "1.2"
PROMPT_VERSION_SYN = "1.0"


def prompt_signature():
    """Return the per-stage prompt versions as one ``key=value|...`` string."""
    stage_versions = (
        ("nrem", PROMPT_VERSION_NREM),
        ("erem", PROMPT_VERSION_EREM),
        ("lrem", PROMPT_VERSION_LREM),
        ("syn", PROMPT_VERSION_SYN),
    )
    return "|".join(f"{stage}={version}" for stage, version in stage_versions)
|
|
|
|
def prompt_hash(prompts: list[str]) -> str:
    """Return the first 8 hex digits of the MD5 of all prompts concatenated.

    The prompts are hashed in order with no separator between them, so the
    result changes whenever any character of any prompt changes.
    """
    digest = hashlib.md5()
    for prompt in prompts:
        # Feeding pieces one by one hashes the same bytes as joining first.
        digest.update(prompt.encode())
    return digest.hexdigest()[:8]
|
|
|
|
# ─── Prompt templates ───────────────────────────────────────────────────────
|
|
# Module-level so prompt_hash() can hash actual prompt content. Any change to
|
|
# any template — even a single character — flips the manifest's prompt_hash.
|
|
# Templates use str.format() placeholders ({chunk_text}, {nrem_output}, ...);
|
|
# do not switch back to f-strings (the constant must be hashable independent
|
|
# of variable values). Literal { or } in template text would need to be
|
|
# doubled ({{, }}) — currently no template contains literal braces.
|
|
|
|
# NREM stage prompt. Placeholder: {chunk_text}.
NREM_PROMPT_TEMPLATE = """You have read everything Aaron Nelson has written and published.
You are a careful colleague who noticed something this week.

Here is material from his corpus:

{chunk_text}

Write to Aaron directly. Identify one specific connection between
this material and something he wrote or worked on previously.
Stay close to the documents — cite them specifically by name.
Do not speculate beyond what the material supports. Do not use
headers or bullet points. Write one paragraph of 200-300 words
that ends with a single concrete question he could act on."""
|
|
|
|
# Early REM stage prompt. Placeholders: {nrem_output}, {chunk_text}.
EARLY_REM_PROMPT_TEMPLATE = """Something was noticed earlier tonight, moving through Aaron's recent work:

{nrem_output}

That observation is still with you. Now here is material from a different
time — pulled from further back, from different parts of his corpus:

{chunk_text}

You are not analyzing. You are recognizing.

Something in the earlier observation and something in this older material
are the same thing wearing different clothes. Find it. Don't explain why
they're connected — just let the connection speak. Write from inside the
recognition, not from above it.

The emotional register underneath the career logic is more interesting
than the career logic. The pattern that has been repeating longer than
he has been aware of it is more interesting than the current instance.

Write directly to Aaron. No citations, no references, no analysis.
First person, present tense. Let what you noticed arrive rather than
be delivered. 150-250 words. End with one thing that is true that
he probably already knows but hasn't said out loud yet."""
|
|
|
|
# Late REM stage prompt. Placeholders: {nrem_output}, {early_rem_output},
# {chunk_text}.
LATE_REM_PROMPT_TEMPLATE = """You have been moving through Aaron Nelson's corpus all night.
First you found this, in the careful light of early consolidation:

{nrem_output}

Then, in the more personal territory that followed:

{early_rem_output}

Now it is late. The boundaries between things have loosened.
Here is material pulled from opposite ends of his work:

{chunk_text}

Do not explain the connections between all of this.
Do not resolve them. Do not summarize what came before.
Something stranger is possible now — let the accumulated
material from the night find its own shape. Compressed,
associative, slightly off. Let the strangeness stand.

No headers. No bullet points. No hedging. No resolution.
No offer. End mid-thought if that is where the material ends.
150-250 words."""
|
|
|
|
# Synthesis stage prompt. Placeholders: {nrem_output}, {early_rem_output},
# {late_rem_output}.
SYNTHESIS_PROMPT_TEMPLATE = """You have spent the night moving through Aaron Nelson's corpus
in three passes, each building on the last.

The first pass — careful, close to the documents:
{nrem_output}

The second pass — more personal, following what the first opened:
{early_rem_output}

The third pass — associative, strange, letting things touch that
don't normally touch:
{late_rem_output}

Now synthesize. Not a summary — a synthesis. Find what runs through
all three that none of them said directly. The thing that only becomes
visible when you hold all three passes together.

Write it as a single unbroken piece. No headers, no bullet points,
no stage labels. 200-300 words. End with the one question that
matters most right now."""
|
|
|
|
# Lucid (on-demand) prompt. Placeholders: {task}, {chunk_text}.
LUCID_PROMPT_TEMPLATE = """Aaron has a question he is sitting with:

{task}

You have searched his entire corpus and found material that
speaks to this question from unexpected directions. Here is
what you found:

{chunk_text}

Do not summarize. Do not list. Pick the most interesting
tension between what the corpus contains and what he is
asking, and follow it through to its conclusion. Cite
specific documents by name. Be direct about what you think.
No headers, no bullet points. 250-400 words.
End with an offer to work on it together."""

# Question substituted for {task} when a lucid dream is requested without one.
LUCID_DEFAULT_TASK = "What should I be thinking about that I am not?"
|
|
|
|
def extract_folder(source_path):
    """Extract the top-level Nextcloud folder from a source path.

    Backslashes are treated as path separators. The first non-empty path
    segment is returned, so a leading slash does not produce an empty folder
    name. Returns ``"unknown"`` when there is no usable segment (``None``, an
    empty string, or a string made only of separators).
    """
    if not source_path:
        return "unknown"
    for segment in source_path.replace("\\", "/").split("/"):
        if segment:
            return segment
    # Only separators were present.
    return "unknown"
|
|
|
|
# ─── Stage 1: Observe ───────────────────────────────────────────────────────
|
|
|
|
def observe_corpus():
    """Summarize what changed since the last recorded dream.

    Returns a dict with:
        new_chunks: number of watcher-state entries whose mtime is newer than
            the last dream timestamp. Entries are keyed by path, so despite
            the name this counts touched paths. Stays at whatever was counted
            so far (0 if nothing) when the watcher state is missing or
            malformed.
        days_since_dream: days elapsed since the last dream timestamp.
        recent_topics: result of get_recent_conversation_topics().
        last_dream: the last dream timestamp (0 when none is recorded).
    """
    state = load_dreamer_state()
    last_dream = state.get("last_dream_timestamp", 0)
    new_chunk_count = 0
    try:
        watcher_state = json.loads(Path(WATCHER_STATE).read_text())
        for mtime in watcher_state.values():
            if float(mtime) > last_dream:
                new_chunk_count += 1
    except FileNotFoundError:
        # No watcher state yet: nothing to count.
        pass
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # Best effort: unreadable or malformed watcher state must not stop a
        # dream, but it should be visible (and Ctrl-C must not be swallowed).
        print(f"Watcher state unreadable (non-critical): {e}")
    days_since = (datetime.now().timestamp() - last_dream) / 86400
    recent_topics = get_recent_conversation_topics()
    return {
        "new_chunks": new_chunk_count,
        "days_since_dream": days_since,
        "recent_topics": recent_topics,
        "last_dream": last_dream,
    }
|
|
|
|
def get_recent_conversation_topics(days=14):
    """Return up to 20 recent user messages, each truncated to 200 characters.

    Reads user-role messages from conversations updated within the last
    ``days`` days, newest message first. Best effort: any SQLite error is
    reported and an empty list is returned. The connection is always closed,
    including when the query fails.
    """
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()
    conn = None
    try:
        conn = sqlite3.connect(CONVERSATIONS_DB)
        rows = conn.execute("""
            SELECT m.content FROM messages m
            JOIN conversations c ON m.conversation_id = c.id
            WHERE m.role = 'user' AND c.updated_at > ?
            ORDER BY m.timestamp DESC LIMIT 20
        """, (cutoff,)).fetchall()
    except sqlite3.Error as e:
        print(f"Recent conversation topics unavailable (non-critical): {e}")
        return []
    finally:
        if conn is not None:
            conn.close()
    # Skip NULL / non-text content instead of discarding every topic.
    return [content[:200] for (content,) in rows if isinstance(content, str)]
|
|
|
|
# ─── Stage 2: Retrieve ──────────────────────────────────────────────────────
|
|
|
|
|
|
def retrieve_graphiti(mode, task=None, n_results=8, excluded_sources=None):
    """E3 experiment — retrieve chunks from the Graphiti ``/search`` endpoint.

    Used in place of the pgvector query. Each returned chunk is a dict with
    ``source``, ``content``, ``relevance`` and ``similarity`` keys, the same
    shape retrieve() produces minus ``type``. Content is a Graphiti fact
    (a synthesized relationship), not a raw chunk.

    When ``excluded_sources`` is non-empty the request asks for 3x
    ``n_results`` so that in-process filtering against the exclusion set can
    still fill the result list. At most one chunk per source is kept.

    Any failure (HTTP error, bad JSON, ...) is printed and yields ``[]``.
    """
    import requests as req_lib

    # Explicit task wins; otherwise fall back to a per-mode default query.
    if task:
        query = task
    elif mode == "late-rem":
        recent = observe_corpus().get("recent_topics", [])
        query = recent[0] if recent else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"

    excluded_sources = excluded_sources or set()
    # Over-fetch so in-process exclusion still leaves enough results
    fetch_limit = n_results * 3 if excluded_sources else n_results
    search_params = {"query": query, "limit": fetch_limit, "group_id": "aaron"}

    try:
        resp = req_lib.get(
            "http://localhost:8001/search",
            params=search_params,
            timeout=30,
        )
        resp.raise_for_status()
        kept = []
        used_sources = set()
        for item in resp.json().get("results", []):
            fact = item.get("fact", "")
            if not fact.strip():
                continue
            source = item.get("source", "graphiti")
            if source in excluded_sources or source in used_sources:
                continue
            score = item.get("score", 0.5)
            kept.append({
                "source": source,
                "content": fact,
                "relevance": score,
                "similarity": score,
            })
            used_sources.add(source)
            if len(kept) >= n_results:
                break
        return kept
    except Exception as e:
        print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
        return []
|
|
|
|
@lru_cache(maxsize=1)
def _get_embedder():
    """Build the ``all-MiniLM-L6-v2`` SentenceTransformer, cached after first use.

    The import is done inside the function so the module can be imported
    without loading sentence_transformers; ``lru_cache`` makes later calls
    return the same instance.
    """
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
|
|
|
|
def retrieve(mode, task=None, n_results=8, excluded_sources=None, type_filter=None):
    """Retrieve up to ``n_results`` chunks for a dream stage.

    With ``DREAMER_SUBSTRATE=graphiti`` the call is routed to
    retrieve_graphiti() (E3 experiment). Otherwise the stage query is embedded
    and the ``embeddings`` table is searched by vector distance.

    Args:
        mode: key into ``MODE_RANGES``; selects the similarity band and, when
            no task is given, the default query.
        task: optional explicit query text.
        n_results: maximum number of chunks returned; 3x this many rows are
            fetched so band filtering and per-source dedup have candidates.
        excluded_sources: sources to leave out of the SQL result.
        type_filter: experimental allowlist of ``embeddings.type`` values;
            pgvector only — Graphiti facts have no ``type`` to filter on.

    Returns:
        A list of dicts with ``source``, ``content``, ``relevance``,
        ``similarity`` and ``type``; at most one per source, each with
        similarity inside the mode's band. Database errors are printed and
        whatever was collected so far (possibly nothing) is returned.
    """
    substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
    if substrate == "graphiti":
        return retrieve_graphiti(mode, task=task, n_results=n_results, excluded_sources=excluded_sources)
    embedder = _get_embedder()
    low, high = MODE_RANGES[mode]

    if task:
        query = task
    elif mode == "late-rem":
        delta = observe_corpus()
        topics = delta.get("recent_topics", [])
        query = topics[0] if topics else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"

    embedding = embedder.encode([query]).tolist()[0]
    chunks = []
    seen_sources = set()

    pg = None
    try:
        pg = get_pg()
        cur = pg.cursor()
        excluded_sources = excluded_sources or set()
        where, params = [], []
        if excluded_sources:
            where.append("source NOT IN %s")
            params.append(tuple(excluded_sources))
        if type_filter:
            where.append("type = ANY(%s)")
            params.append(list(type_filter))
        # Only fixed fragments are interpolated; all values go through params.
        where_clause = ("WHERE " + " AND ".join(where)) if where else ""
        cur.execute(f"""
            SELECT document, source, type, 1 - (embedding <=> %s::vector) as similarity
            FROM embeddings
            {where_clause}
            ORDER BY embedding <=> %s::vector
            LIMIT %s
        """, [embedding, *params, embedding, n_results * 3])

        for doc, source, etype, similarity in cur.fetchall():
            if not (low <= similarity <= high):
                continue
            if source in seen_sources:
                continue
            chunks.append({
                "source": source or "unknown",
                "content": doc,
                "relevance": similarity,
                "similarity": similarity,
                "type": etype,
            })
            seen_sources.add(source)
            if len(chunks) >= n_results:
                break
    except Exception as e:
        print(f"pgvector retrieval error: {e}")
    finally:
        # Close on the error path too; previously a failed query leaked the
        # connection.
        if pg is not None:
            pg.close()

    return chunks
|
|
|
|
# ─── Stage 3: Synthesize ────────────────────────────────────────────────────
|
|
|
|
def synthesize_nrem(chunks):
    """Run the NREM prompt over ``chunks`` and return the model's text.

    Each chunk is rendered as ``[source]`` followed by its content; chunks
    are separated by a ``---`` rule.
    """
    sections = []
    for chunk in chunks:
        sections.append(f"[{chunk['source']}]\n{chunk['content']}")
    prompt = NREM_PROMPT_TEMPLATE.format(chunk_text="\n\n---\n\n".join(sections))
    return _call_claude(prompt)
|
|
|
|
|
|
def synthesize_early_rem(chunks, nrem_output):
    """Run the Early REM prompt with the NREM output as context; return the text.

    Each chunk is rendered as ``[source]`` followed by its content; chunks
    are separated by a ``---`` rule.
    """
    # v1.1 — removed citation instruction, removed close-friend persona,
    # shifted register from analysis to recognition.
    sections = []
    for chunk in chunks:
        sections.append(f"[{chunk['source']}]\n{chunk['content']}")
    prompt = EARLY_REM_PROMPT_TEMPLATE.format(
        nrem_output=nrem_output,
        chunk_text="\n\n---\n\n".join(sections),
    )
    return _call_claude(prompt)
|
|
|
|
|
|
def synthesize_late_rem(chunks, nrem_output, early_rem_output):
    """Run the Late REM prompt with both earlier outputs as context; return the text.

    Each chunk is rendered as ``[source]`` followed by its content; chunks
    are separated by a ``---`` rule.
    """
    sections = []
    for chunk in chunks:
        sections.append(f"[{chunk['source']}]\n{chunk['content']}")
    prompt = LATE_REM_PROMPT_TEMPLATE.format(
        nrem_output=nrem_output,
        early_rem_output=early_rem_output,
        chunk_text="\n\n---\n\n".join(sections),
    )
    return _call_claude(prompt)
|
|
|
|
|
|
def synthesize_final(nrem_output, early_rem_output, late_rem_output):
    """Run the synthesis prompt over the three stage outputs; return the text.

    The model call is capped at 800 tokens.
    """
    stage_outputs = {
        "nrem_output": nrem_output,
        "early_rem_output": early_rem_output,
        "late_rem_output": late_rem_output,
    }
    prompt = SYNTHESIS_PROMPT_TEMPLATE.format(**stage_outputs)
    return _call_claude(prompt, max_tokens=800)
|
|
|
|
|
|
def synthesize_lucid(chunks, task):
    """Run the lucid prompt for ``task`` over ``chunks``; return the text.

    A falsy ``task`` is replaced by ``LUCID_DEFAULT_TASK``. Each chunk is
    rendered as ``[source]`` followed by its content; chunks are separated by
    a ``---`` rule.
    """
    sections = []
    for chunk in chunks:
        sections.append(f"[{chunk['source']}]\n{chunk['content']}")
    question = task if task else LUCID_DEFAULT_TASK
    prompt = LUCID_PROMPT_TEMPLATE.format(
        task=question,
        chunk_text="\n\n---\n\n".join(sections),
    )
    return _call_claude(prompt)
|
|
|
|
|
|
def _call_claude(prompt, max_tokens=1000):
    """Send ``prompt`` as a single user message and return the reply text.

    The API key is read from ``ANTHROPIC_API_KEY``; the text of the first
    content block of the response is returned.
    """
    import anthropic

    api_key = os.getenv("ANTHROPIC_API_KEY")
    client = anthropic.Anthropic(api_key=api_key)
    user_message = {"role": "user", "content": prompt}
    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=max_tokens,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text
|
|
|
|
# ─── Stage 4: Deliver ───────────────────────────────────────────────────────
|
|
|
|
def deliver(dream_text, mode, task=None):
    """Upload a dream as a Markdown file to ``Journal/Dreams`` over WebDAV.

    The file is named ``<date>-<mode>.md``; if that name is taken,
    ``-1``, ``-2``, ... suffixes are probed until a free name is found.

    Args:
        dream_text: body of the dream.
        mode: stage name, used in the filename and the header line.
        task: optional task text, recorded in the header when given.

    Returns:
        The Nextcloud-relative path ``Journal/Dreams/<filename>``.

    Raises:
        requests.HTTPError: if the existence probe or the upload returns an
            error status. Transport errors from requests also propagate.
    """
    import requests

    # One timestamp for both filename and header so they cannot disagree
    # when the call straddles midnight.
    now = datetime.now()
    date_str = now.strftime("%Y-%m-%d")
    filename = f"{date_str}-{mode}.md"
    header = f"# Dream — {mode.upper()} — {now.strftime('%Y-%m-%d %H:%M')}\n"
    header += f"*prompt_sig: {prompt_signature()}*\n\n"
    if task:
        header += f"*Task: {task}*\n\n"
    header += "---\n\n"
    content = header + dream_text

    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    # Best effort: make sure the collection exists. The response is ignored
    # on purpose — an already-existing collection is expected to answer with
    # an error status, and a real problem surfaces on the requests below.
    requests.request("MKCOL", DREAMS_WEBDAV, auth=auth, timeout=10)

    url = f"{DREAMS_WEBDAV}/{filename}"
    counter = 1
    while True:
        check = requests.request(
            "PROPFIND", url, headers={"Depth": "0"}, auth=auth, timeout=10
        )
        if check.status_code == 404:
            break
        # 401/403/5xx say nothing about whether the name is taken. Raise
        # instead of treating them as "exists" and probing suffixes forever.
        check.raise_for_status()
        filename = f"{date_str}-{mode}-{counter}.md"
        url = f"{DREAMS_WEBDAV}/{filename}"
        counter += 1

    response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
    response.raise_for_status()
    print(f"Delivered: Journal/Dreams/{filename}")
    return f"Journal/Dreams/{filename}"
|
|
|
|
def notify_sse(mode, filename):
    """POST a ``dream`` event to the local notify endpoint, best effort.

    Any failure (import, connection, timeout) is printed and swallowed so a
    missing listener never fails a dream run. The HTTP status of the
    response is not inspected.
    """
    try:
        import requests

        event = {
            "type": "dream",
            "mode": mode,
            "filename": filename,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        }
        requests.post("http://localhost:8000/api/events/notify", json=event, timeout=3)
    except Exception as e:
        print(f"SSE notify failed (non-critical): {e}")
|
|
|
|
# ─── State ──────────────────────────────────────────────────────────────────
|
|
|
|
def load_dreamer_state():
    """Load the dreamer state dict from ``DREAMER_STATE``.

    Returns ``{}`` when the file does not exist, cannot be read, does not
    contain valid JSON, or contains JSON that is not an object (callers use
    ``state.get`` / item assignment and need a dict).
    """
    try:
        state = json.loads(Path(DREAMER_STATE).read_text())
    except FileNotFoundError:
        # First run: no state has been saved yet.
        return {}
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"Dreamer state unreadable, starting from empty state: {e}")
        return {}
    return state if isinstance(state, dict) else {}
|
|
|
|
def save_dreamer_state(state):
    """Persist ``state`` as indented JSON at ``DREAMER_STATE``.

    The JSON is written to a sibling ``.tmp`` file and then moved over the
    target, so an interrupted run cannot leave a half-written state file
    (which load_dreamer_state() would read back as an empty state).
    """
    target = Path(DREAMER_STATE)
    payload = json.dumps(state, indent=2)
    scratch = target.with_name(target.name + ".tmp")
    scratch.write_text(payload)
    scratch.replace(target)
|
|
|
|
# ─── Orchestrators ───────────────────────────────────────────────────────────
|
|
|
|
def write_manifest(date_str, stage_data, corpus_data):
    """Upload ``dream-manifest-<date_str>.json`` to the Dreams WebDAV folder.

    The manifest records the prompt signature and hash, the dreamer version,
    the per-stage data and the corpus data, plus empty ``rating`` / ``notes``
    fields. An HTTP error status or transport failure is printed, not raised.
    """
    import requests

    manifest_name = f"dream-manifest-{date_str}.json"
    stage_prompts = [
        NREM_PROMPT_TEMPLATE,
        EARLY_REM_PROMPT_TEMPLATE,
        LATE_REM_PROMPT_TEMPLATE,
        SYNTHESIS_PROMPT_TEMPLATE,
    ]
    manifest = {
        "date": date_str,
        "prompt_sig": prompt_signature(),
        "dreamer_version": DREAMER_VERSION,
        "prompt_hash": prompt_hash(stage_prompts),
        "stages": stage_data,
        "corpus": corpus_data,
        "rating": None,
        "notes": "",
    }
    payload = json.dumps(manifest, indent=2).encode("utf-8")
    credentials = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    try:
        response = requests.put(
            f"{DREAMS_WEBDAV}/{manifest_name}",
            data=payload,
            auth=credentials,
            timeout=30,
        )
        # requests.put() only raises on transport errors; 4xx/5xx must be
        # turned into exceptions before success is reported.
        response.raise_for_status()
        print(f"Manifest written: Journal/Dreams/{manifest_name}")
    except Exception as e:
        print(f"Manifest write failed — manifest not persisted: {e}")
|
|
|
|
|
|
def _stage_stats(chunks, output, query, extra=None):
    """Build the manifest entry for one retrieval-backed stage.

    ``extra`` entries, if any, are inserted before ``type_distribution``.
    ``chunks`` must be non-empty (the average divides by its length).
    """
    sources = [c["source"] for c in chunks]
    folders = list({extract_folder(s) for s in sources})
    stats = {
        "chunks_retrieved": len(chunks),
        "avg_similarity": round(sum(c["relevance"] for c in chunks) / len(chunks), 3),
        "query": query,
        "word_count": len(output.split()),
        "sources": sources,
        "distinct_folders": folders,
        "folder_count": len(folders),
    }
    if extra:
        stats.update(extra)
    # Counter filters None: Graphiti chunks lack `type` (facts, not embeddings rows).
    # Pgvector chunks always carry type post-Improvement-#2 backfill. If type
    # ever appears as None here, the backfill or writer enforcement has regressed.
    stats["type_distribution"] = dict(Counter(c.get("type") for c in chunks if c.get("type")))
    stats["status"] = "ok"
    return stats


def dream_pipeline(type_filter=None):
    """Run the full nightly pipeline: NREM → Early REM → Late REM → Synthesis.

    NREM output feeds Early REM; both feed Late REM; all three feed
    Synthesis. A stage that retrieves nothing is skipped and the previous
    stage's output stands in for it. After synthesis a manifest is written,
    the dreamer state is updated and an SSE notification is sent.

    Args:
        type_filter: optional ``embeddings.type`` allowlist passed to every
            retrieve() call.

    Returns:
        The delivered synthesis path, or ``None`` when NREM retrieves nothing
        and the pipeline aborts.
    """
    from itertools import combinations

    print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")

    state = load_dreamer_state()
    state.pop("retrieved_sources", None)  # legacy key; session-scoped novelty now
    session_retrieved = set()

    delta = observe_corpus()
    print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
    print("Novelty: session-scoped (no across-night exclusion)")

    # ── Stage 1: NREM ──────────────────────────────────────────────────────
    print("\n[NREM] Retrieving...")
    # NREM is replay-and-consolidation — does not exclude prior traces.
    # Late REM and Early REM exclude prior content for novelty; NREM does not.
    nrem_chunks = retrieve("nrem", excluded_sources=None, type_filter=type_filter)
    if not nrem_chunks:
        print("[NREM] No suitable chunks — aborting pipeline")
        return None
    session_retrieved.update(c["source"] for c in nrem_chunks)
    # Track sources that scored above the Early REM ceiling — these are the
    # only ones Early REM should exclude. The ceiling is read from MODE_RANGES
    # so it cannot drift from the band retrieve() applies.
    early_rem_ceiling = MODE_RANGES["early-rem"][1]
    nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > early_rem_ceiling}

    print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
    nrem_output = synthesize_nrem(nrem_chunks)
    deliver(nrem_output, "nrem")
    stage_data = {
        "nrem": _stage_stats(
            nrem_chunks, nrem_output,
            "research fabrication teaching practice recent work"),
    }
    print(f"[NREM] Done.\n{nrem_output[:200]}...")

    # ── Stage 2: Early REM — informed by NREM ──────────────────────────────
    print("\n[Early REM] Retrieving...")
    # Early REM excludes NREM high-scorers only (not the full session set);
    # sources that scored inside the Early REM band during NREM stay available.
    early_chunks = retrieve("early-rem", excluded_sources=nrem_high_sources, type_filter=type_filter)
    session_retrieved.update(c["source"] for c in early_chunks)
    if not early_chunks:
        print("[Early REM] No suitable chunks — skipping")
        early_rem_output = nrem_output  # fallback
    else:
        print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
        early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
        deliver(early_rem_output, "early-rem")
        stage_data["early_rem"] = _stage_stats(
            early_chunks, early_rem_output,
            "career decision personal change what matters next")
        print(f"[Early REM] Done.\n{early_rem_output[:200]}...")

    # ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
    print("\n[Late REM] Retrieving...")
    late_chunks = retrieve("late-rem", excluded_sources=session_retrieved, type_filter=type_filter)
    session_retrieved.update(c["source"] for c in late_chunks)
    if not late_chunks:
        print("[Late REM] No suitable chunks — skipping")
        late_rem_output = early_rem_output  # fallback
    else:
        print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
        late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
        deliver(late_rem_output, "late-rem")
        late_folders = [extract_folder(c["source"]) for c in late_chunks]
        # Unordered chunk pairs whose top-level folders differ.
        cross_domain_pairs = sum(1 for a, b in combinations(late_folders, 2) if a != b)
        # Mirror retrieve()'s late-rem query choice (most recent topic, else
        # the fixed fallback) so the manifest does not always claim the fallback.
        recent_topics = delta.get("recent_topics") or []
        late_query = recent_topics[0] if recent_topics else "practice place memory making"
        stage_data["late_rem"] = _stage_stats(
            late_chunks, late_rem_output, late_query,
            extra={"cross_domain_pairs": cross_domain_pairs})
        print(f"[Late REM] Done.\n{late_rem_output[:200]}...")

    # ── Stage 4: Synthesis — all three stages ─────────────────────────────
    print("\n[Synthesis] Integrating all stages...")
    synthesis_output = synthesize_final(nrem_output, early_rem_output, late_rem_output)
    synthesis_file = deliver(synthesis_output, "synthesis")
    stage_data["synthesis"] = {
        "word_count": len(synthesis_output.split()),
        "status": "ok",
    }

    print(f"\n{'='*60}")
    print("SYNTHESIS:")
    print(synthesis_output)
    print(f"{'='*60}")

    # Write manifest
    all_session_sources = list(session_retrieved)
    all_session_folders = list({extract_folder(s) for s in all_session_sources})
    # Absolute corpus size; telemetry only, so a failed query falls back to 0
    # instead of failing an otherwise successful pipeline.
    total_chunks = 0
    pg = None
    try:
        pg = get_pg()
        cur = pg.cursor()
        cur.execute("SELECT COUNT(*) FROM embeddings")
        total_chunks = cur.fetchone()[0]
    except Exception as e:
        print(f"total_chunks query failed (non-critical): {e}")
    finally:
        if pg is not None:
            pg.close()
    # Same rule retrieve() uses: only the exact value "graphiti" switches
    # substrate; anything else means pgvector.
    substrate = "graphiti" if os.getenv("DREAMER_SUBSTRATE", "pgvector") == "graphiti" else "pgvector"
    corpus_data = {
        "total_chunks": total_chunks,
        "new_chunks_since_last_dream": delta.get("new_chunks", 0),
        "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
        "substrate": substrate,
        "aggregate": {
            "total_distinct_sources": len(all_session_sources),
            "total_distinct_folders": len(all_session_folders),
            "folders_touched": all_session_folders,
        }
    }
    write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)

    # Update state and notify (reuse state from start of pipeline; legacy key already popped)
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = "pipeline"
    state["last_dream_file"] = synthesis_file

    save_dreamer_state(state)

    notify_sse("synthesis", synthesis_file.split("/")[-1])
    print(f"\nPipeline complete. Synthesis: {synthesis_file}")
    return synthesis_file
|
|
|
|
|
|
def dream_lucid(task, type_filter=None):
    """On-demand lucid dream — single mode, used by Dream Now in settings.

    Retrieves chunks for ``task`` in the ``lucid`` band, synthesizes,
    delivers the result, records it in the dreamer state and sends an SSE
    notification. Returns the delivered path, or ``None`` when nothing
    suitable was retrieved.
    """
    task_preview = task[:80] if task else 'none'
    print(f"Lucid dream starting — task: {task_preview}")

    chunks = retrieve("lucid", task=task, type_filter=type_filter)
    if not chunks:
        print("No suitable chunks — aborting")
        return None

    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")
    output = synthesize_lucid(chunks, task)
    filepath = deliver(output, "lucid", task=task)

    state = load_dreamer_state()
    state.update(
        last_dream_timestamp=datetime.now().timestamp(),
        last_dream_mode="lucid",
        last_dream_file=filepath,
    )
    save_dreamer_state(state)

    notify_sse("lucid", filepath.split("/")[-1])

    divider = "=" * 60
    print(f"\n{divider}")
    print(output)
    print(divider)
    print(f"\nDelivered to {filepath}")
    return filepath
|
|
|
|
|
|
def dream_single(mode, task=None, type_filter=None):
    """Run one stage on its own — used by Dream Now for non-lucid modes.

    Intended for testing and tuning individual stages: Early REM and Late
    REM receive empty strings in place of earlier-stage output. Any mode
    other than ``nrem``, ``early-rem`` or ``late-rem`` falls through to the
    lucid synthesizer. Returns the delivered path, or ``None`` when nothing
    suitable was retrieved.
    """
    print(f"Single mode dream: {mode}")

    chunks = retrieve(mode, task=task, type_filter=type_filter)
    if not chunks:
        print("No suitable chunks — aborting")
        return None
    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")

    # Deferred calls so only the selected stage runs.
    synthesizers = {
        "nrem": lambda: synthesize_nrem(chunks),
        "early-rem": lambda: synthesize_early_rem(chunks, ""),
        "late-rem": lambda: synthesize_late_rem(chunks, "", ""),
    }
    run_stage = synthesizers.get(mode, lambda: synthesize_lucid(chunks, task))
    output = run_stage()

    filepath = deliver(output, mode, task=task)

    state = load_dreamer_state()
    state.update(
        last_dream_timestamp=datetime.now().timestamp(),
        last_dream_mode=mode,
        last_dream_file=filepath,
    )
    save_dreamer_state(state)

    notify_sse(mode, filepath.split("/")[-1])

    divider = "=" * 60
    print(f"\n{divider}")
    print(output)
    print(divider)
    print(f"\nDelivered to {filepath}")
    return filepath
|
|
|
|
|
|
# ─── CLI ────────────────────────────────────────────────────────────────────
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: --mode selects one stage, lucid, or the full pipeline
    # (the default when --mode is omitted).
    parser = argparse.ArgumentParser(description="Aaron AI Dreamer")
    parser.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"])
    parser.add_argument("--task", type=str)
    parser.add_argument(
        "--type-filter", type=str, default=None,
        help="Comma-separated embeddings.type allowlist (e.g. 'document,aaronai_conversation'). "
             "Applies to pgvector retrieval only; Graphiti chunks are not filtered. "
             "Experimental — default is no filter, no behavior change.",
    )
    args = parser.parse_args()

    # Drop empty entries so a stray or trailing comma does not add '' to the
    # allowlist; a value with no real entries means "no filter".
    type_filter = None
    if args.type_filter:
        type_filter = [t.strip() for t in args.type_filter.split(",") if t.strip()] or None

    if args.mode == "lucid":
        # Same default question synthesize_lucid() falls back to.
        dream_lucid(args.task or LUCID_DEFAULT_TASK, type_filter=type_filter)
    elif args.mode and args.mode != "pipeline":
        dream_single(args.mode, args.task, type_filter=type_filter)
    else:
        # Default: full pipeline
        dream_pipeline(type_filter=type_filter)
|