# aaronAI/scripts/dream.py.bak.20260501-002209

"""
Aaron AI Dreamer — Active Inference Engine
Interdependent stage architecture grounded in sleep consolidation research.

Nightly pipeline: NREM → Early REM → Late REM → Synthesis
Each stage receives the previous stage's output as context.
Lucid mode is on-demand only (Dream Now from settings).

Research basis:
- Singh et al. PNAS 2022: alternating NREM/REM outperforms single-stage approaches
- Klinzing et al. Nature Neuroscience 2019: SO-spindle-ripple coupling is interdependent
- REM operates on what NREM produced — stages are not discrete alternatives
"""

import os
import json
import sqlite3
import argparse
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv
import psycopg2
import hashlib

# Load variables from ~/aaronai/.env before any os.getenv() call below.
# override=True asks python-dotenv to let values from that file replace
# variables already present in the process environment.
load_dotenv(Path.home() / "aaronai" / ".env", override=True)

# Connection string passed to psycopg2.connect() by get_pg().
# None when the PG_DSN variable is not set.
PG_DSN = os.getenv("PG_DSN")

def get_pg():
    """Open and return a new psycopg2 connection using PG_DSN.

    Each call creates a fresh connection; the caller is responsible for
    closing it.
    """
    connection = psycopg2.connect(PG_DSN)
    return connection

# ─── Paths ──────────────────────────────────────────────────────────────────
# SQLite database read by get_recent_conversation_topics().
CONVERSATIONS_DB = str(Path.home() / "aaronai" / "conversations.db")
# JSON file read by observe_corpus(); it is iterated as a mapping whose
# values are compared against the last dream timestamp.
WATCHER_STATE    = str(Path.home() / "aaronai" / "watcher_state.json")
# JSON file read and written by load_dreamer_state() / save_dreamer_state().
DREAMER_STATE    = str(Path.home() / "aaronai" / "dreamer_state.json")
# NOTE(review): not referenced anywhere else in the visible part of this file.
JOURNAL_DIR      = "/home/aaron/nextcloud/data/data/aaron/files/Journal/Daily"

# Nextcloud settings come from the environment, with the defaults shown here.
# An unset NEXTCLOUD_PASSWORD falls back to an empty string.
NEXTCLOUD_URL      = os.getenv("NEXTCLOUD_URL", "https://nextcloud.aaronnelson.studio")
NEXTCLOUD_USER     = os.getenv("NEXTCLOUD_USER", "aaron")
NEXTCLOUD_PASSWORD = os.getenv("NEXTCLOUD_PASSWORD", "")
# WebDAV collection URL that deliver() and write_manifest() upload into.
DREAMS_WEBDAV      = f"{NEXTCLOUD_URL}/remote.php/dav/files/{NEXTCLOUD_USER}/Journal/Dreams"

# Similarity ranges calibrated for all-MiniLM-L6-v2
# Each value is a (low, high) pair. retrieve() keeps a row only when
# low <= similarity <= high for the requested mode, and raises KeyError
# for a mode that is not listed here.
MODE_RANGES = {
    "nrem":      (0.48, 0.72),
    "early-rem": (0.38, 0.55),
    "late-rem":  (0.22, 0.42),
    "lucid":     (0.32, 0.72),
}
# Written into every manifest as "dreamer_version" by write_manifest().
DREAMER_VERSION = "1.1"  # 1.0=original exclusion logic; 1.1=score-band exclusion

# ─── Prompt versioning ──────────────────────────────────────────────────────
# Bump the relevant constant manually when changing a prompt.
# The four values are combined into one string by prompt_signature(), which
# deliver() writes into each dream file header and write_manifest() stores
# as "prompt_sig". Nothing in this file checks them against the prompt text.
PROMPT_VERSION_NREM = "1.0"
PROMPT_VERSION_EREM = "1.1"
PROMPT_VERSION_LREM = "1.2"
PROMPT_VERSION_SYN  = "1.0"

def prompt_signature():
    """Return the four prompt versions as one pipe-delimited string.

    The result has the form ``nrem=<v>|erem=<v>|lrem=<v>|syn=<v>``, built
    from the PROMPT_VERSION_* module constants.
    """
    labelled_versions = (
        ("nrem", PROMPT_VERSION_NREM),
        ("erem", PROMPT_VERSION_EREM),
        ("lrem", PROMPT_VERSION_LREM),
        ("syn", PROMPT_VERSION_SYN),
    )
    return "|".join(f"{label}={version}" for label, version in labelled_versions)

def prompt_hash(prompts: list[str]) -> str:
    """Return a short fingerprint of a list of prompt strings.

    The strings are concatenated in order, encoded with the default
    encoding (UTF-8) and hashed with MD5; the first 8 hex characters of the
    digest are returned. This is an identifier, not a security measure.
    """
    payload = "".join(prompts).encode()
    full_digest = hashlib.md5(payload).hexdigest()
    return full_digest[:8]

def extract_folder(source_path):
    """Extract the top-level Nextcloud folder from a source path.

    Backslashes are treated as path separators. The first non-empty path
    segment is returned, so a leading separator does not produce an empty
    folder name.

    Args:
        source_path: Path string of a retrieved source; may be empty or None.

    Returns:
        The first non-empty segment, or "unknown" when there is none.
    """
    if not source_path:
        return "unknown"
    # str.split() always returns at least one element, so an emptiness check
    # on the list can never trigger; look for the first non-empty segment.
    for segment in source_path.replace("\\", "/").split("/"):
        if segment:
            return segment
    return "unknown"

# ─── Stage 1: Observe ───────────────────────────────────────────────────────

def observe_corpus():
    """Summarize what changed in the corpus since the last dream.

    Reads the dreamer state for ``last_dream_timestamp`` (0 when absent) and
    counts the entries in the watcher state file whose value, converted with
    float(), is greater than that timestamp. Note the count is per watcher
    entry, whatever the "new_chunks" key name suggests.

    Returns:
        dict with keys "new_chunks", "days_since_dream", "recent_topics"
        and "last_dream".
    """
    state = load_dreamer_state()
    last_dream = state.get("last_dream_timestamp", 0)
    new_chunk_count = 0
    try:
        watcher_state = json.loads(Path(WATCHER_STATE).read_text())
        new_chunk_count = sum(
            1 for mtime in watcher_state.values() if float(mtime) > last_dream
        )
    except FileNotFoundError:
        # No watcher state yet: treated as "nothing new", as before.
        pass
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # Unreadable, malformed or unexpectedly shaped file. Stay best-effort
        # but say so instead of silently reporting 0.
        print(f"Watcher state unreadable, assuming 0 new chunks: {e}")
    days_since = (datetime.now().timestamp() - last_dream) / 86400
    recent_topics = get_recent_conversation_topics()
    return {
        "new_chunks": new_chunk_count,
        "days_since_dream": days_since,
        "recent_topics": recent_topics,
        "last_dream": last_dream,
    }

def get_recent_conversation_topics(days=14):
    """Return up to 20 recent user messages, newest first, truncated to 200 chars.

    Only messages with role 'user' belonging to conversations whose
    ``updated_at`` is later than ``days`` days ago are considered.

    Args:
        days: Look-back window in days.

    Returns:
        List of message contents (each cut to 200 characters). An empty list
        is returned when the lookup fails for any handled reason.
    """
    conn = None
    try:
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()
        conn = sqlite3.connect(CONVERSATIONS_DB)
        rows = conn.execute("""
            SELECT m.content FROM messages m
            JOIN conversations c ON m.conversation_id = c.id
            WHERE m.role = 'user' AND c.updated_at > ?
            ORDER BY m.timestamp DESC LIMIT 20
        """, (cutoff,)).fetchall()
    except (sqlite3.Error, TypeError, ValueError, OverflowError) as e:
        # Best-effort: callers treat "no topics" as a normal outcome.
        print(f"Recent topic lookup failed: {e}")
        return []
    finally:
        # Close the connection on the error path too; the previous version
        # only closed it when the query succeeded.
        if conn is not None:
            conn.close()
    # Skip NULL contents instead of letting one bad row discard every topic.
    return [content[:200] for (content,) in rows if content is not None]

# ─── Stage 2: Retrieve ──────────────────────────────────────────────────────


def retrieve_graphiti(mode, task=None, n_results=8):
    """E3 experiment — retrieval from the Graphiti /search endpoint.

    Used in place of the pgvector query. The returned list has the same
    dict shape as retrieve() ("source", "content", "relevance",
    "similarity") so the rest of the pipeline is unaffected. The content is
    the "fact" field of each Graphiti result rather than a raw chunk.
    Any failure is printed and yields an empty list.
    """
    import requests as req_lib

    # Query choice mirrors retrieve(): explicit task first, then per mode.
    if task:
        query = task
    elif mode == "late-rem":
        recent_topics = observe_corpus().get("recent_topics", [])
        query = recent_topics[0] if recent_topics else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"

    search_params = {"query": query, "limit": n_results, "group_id": "aaron"}
    try:
        resp = req_lib.get(
            "http://localhost:8001/search",
            params=search_params,
            timeout=30,
        )
        resp.raise_for_status()
        hits = resp.json().get("results", [])
        # Results whose fact is blank are dropped; a missing score becomes 0.5.
        return [
            {
                "source": hit.get("source", "graphiti"),
                "content": hit.get("fact", ""),
                "relevance": hit.get("score", 0.5),
                "similarity": hit.get("score", 0.5),
            }
            for hit in hits
            if hit.get("fact", "").strip()
        ]
    except Exception as e:
        print(f"[Graphiti retrieval error: {e}] — falling back to empty.")
        return []

def retrieve(mode, task=None, n_results=8, excluded_sources=None):
    """Retrieve corpus chunks whose similarity falls inside the mode's band.

    Args:
        mode: Key of MODE_RANGES ("nrem", "early-rem", "late-rem", "lucid").
        task: Optional query text; when given it replaces the per-mode query.
        n_results: Maximum number of chunks returned.
        excluded_sources: Optional collection of source names to exclude in
            the SQL query.

    Returns:
        List of dicts with "source", "content", "relevance" and "similarity"
        (the last two hold the same value), at most one per source. Database
        errors are printed and whatever was collected so far is returned.
    """
    # E3 experiment: DREAMER_SUBSTRATE=graphiti routes retrieval to Graphiti /search
    # Default behavior: pgvector similarity search (unchanged)
    substrate = os.getenv("DREAMER_SUBSTRATE", "pgvector")
    if substrate == "graphiti":
        return retrieve_graphiti(mode, task=task, n_results=n_results)
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    low, high = MODE_RANGES[mode]

    if task:
        query = task
    elif mode == "late-rem":
        delta = observe_corpus()
        topics = delta.get("recent_topics", [])
        query = topics[0] if topics else "practice place memory making"
    elif mode == "early-rem":
        query = "career decision personal change what matters next"
    else:
        query = "research fabrication teaching practice recent work"

    embedding = embedder.encode([query]).tolist()[0]
    excluded = tuple(excluded_sources or ())
    chunks = []
    seen_sources = set()

    # NOTE(review): the similarity band is applied after the nearest
    # n_results * 3 rows are fetched, so a low band can come back empty when
    # every fetched row scores above `high`.
    pg = None
    try:
        pg = get_pg()
        with pg.cursor() as cur:
            if excluded:
                # An empty tuple would render as invalid SQL, hence the
                # separate query without the NOT IN clause below.
                cur.execute("""
                    SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
                    FROM embeddings
                    WHERE source NOT IN %s
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                """, (embedding, excluded, embedding, n_results * 3))
            else:
                cur.execute("""
                    SELECT document, source, 1 - (embedding <=> %s::vector) as similarity
                    FROM embeddings
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                """, (embedding, embedding, n_results * 3))
            rows = cur.fetchall()

        for doc, source, similarity in rows:
            if not (low <= similarity <= high):
                continue
            if source in seen_sources:
                continue
            chunks.append({
                "source": source or "unknown",
                "content": doc,
                "relevance": similarity,
                "similarity": similarity,
            })
            seen_sources.add(source)
            if len(chunks) >= n_results:
                break
    except Exception as e:
        print(f"pgvector retrieval error: {e}")
    finally:
        # Always release the connection; previously it stayed open whenever
        # the query or the loop raised.
        if pg is not None:
            pg.close()

    return chunks

# ─── Stage 3: Synthesize ────────────────────────────────────────────────────

def synthesize_nrem(chunks):
    # NREM stage: asks for one document-grounded connection across the
    # retrieved chunks and returns the model's text.
    # Kept as comments rather than a docstring because write_manifest()
    # reads this function's __doc__ when computing the manifest prompt hash.
    labelled_sections = (
        f"[{chunk['source']}]\n{chunk['content']}" for chunk in chunks
    )
    chunk_text = "\n\n---\n\n".join(labelled_sections)
    prompt = f"""You have read everything Aaron Nelson has written and published.
You are a careful colleague who noticed something this week.

Here is material from his corpus:

{chunk_text}

Write to Aaron directly. Identify one specific connection between
this material and something he wrote or worked on previously.
Stay close to the documents — cite them specifically by name.
Do not speculate beyond what the material supports. Do not use
headers or bullet points. Write one paragraph of 200-300 words
that ends with a single concrete question he could act on."""
    reply = _call_claude(prompt)
    return reply


def synthesize_early_rem(chunks, nrem_output):
    # Early REM stage: the prompt embeds the NREM text first, then the newly
    # retrieved chunks, and returns the model's text.
    # v1.1 — removed citation instruction, removed close-friend persona,
    # shifted register from analysis to recognition.
    # Kept as comments rather than a docstring because write_manifest()
    # reads this function's __doc__ when computing the manifest prompt hash.
    labelled_sections = (
        f"[{chunk['source']}]\n{chunk['content']}" for chunk in chunks
    )
    chunk_text = "\n\n---\n\n".join(labelled_sections)
    prompt = f"""Something was noticed earlier tonight, moving through Aaron's recent work:

{nrem_output}

That observation is still with you. Now here is material from a different
time — pulled from further back, from different parts of his corpus:

{chunk_text}

You are not analyzing. You are recognizing.

Something in the earlier observation and something in this older material
are the same thing wearing different clothes. Find it. Don't explain why
they're connected — just let the connection speak. Write from inside the
recognition, not from above it.

The emotional register underneath the career logic is more interesting
than the career logic. The pattern that has been repeating longer than
he has been aware of it is more interesting than the current instance.

Write directly to Aaron. No citations, no references, no analysis.
First person, present tense. Let what you noticed arrive rather than
be delivered. 150-250 words. End with one thing that is true that
he probably already knows but hasn't said out loud yet."""
    reply = _call_claude(prompt)
    return reply


def synthesize_late_rem(chunks, nrem_output, early_rem_output):
    # Late REM stage: the prompt embeds the NREM text, the Early REM text and
    # then the newly retrieved chunks, and returns the model's text.
    # Kept as comments rather than a docstring because write_manifest()
    # reads this function's __doc__ when computing the manifest prompt hash.
    labelled_sections = (
        f"[{chunk['source']}]\n{chunk['content']}" for chunk in chunks
    )
    chunk_text = "\n\n---\n\n".join(labelled_sections)
    prompt = f"""You have been moving through Aaron Nelson's corpus all night.
First you found this, in the careful light of early consolidation:

{nrem_output}

Then, in the more personal territory that followed:

{early_rem_output}

Now it is late. The boundaries between things have loosened.
Here is material pulled from opposite ends of his work:

{chunk_text}

Do not explain the connections between all of this.
Do not resolve them. Do not summarize what came before.
Something stranger is possible now — let the accumulated
material from the night find its own shape. Compressed,
associative, slightly off. Let the strangeness stand.

No headers. No bullet points. No hedging. No resolution.
No offer. End mid-thought if that is where the material ends.
150-250 words."""
    reply = _call_claude(prompt)
    return reply


def synthesize_final(nrem_output, early_rem_output, late_rem_output):
    # Synthesis stage: no retrieval of its own; the prompt embeds the three
    # earlier stage texts and the call is capped at 800 output tokens.
    # Kept as comments rather than a docstring because write_manifest()
    # reads this function's __doc__ when computing the manifest prompt hash.
    prompt = f"""You have spent the night moving through Aaron Nelson's corpus
in three passes, each building on the last.

The first pass — careful, close to the documents:
{nrem_output}

The second pass — more personal, following what the first opened:
{early_rem_output}

The third pass — associative, strange, letting things touch that
don't normally touch:
{late_rem_output}

Now synthesize. Not a summary — a synthesis. Find what runs through
all three that none of them said directly. The thing that only becomes
visible when you hold all three passes together.

Write it as a single unbroken piece. No headers, no bullet points,
no stage labels. 200-300 words. End with the one question that
matters most right now."""
    token_budget = 800
    return _call_claude(prompt, max_tokens=token_budget)


def synthesize_lucid(chunks, task):
    # Lucid (on-demand) stage: the prompt is built around the caller's task
    # and the retrieved chunks; a falsy task is replaced by a default question.
    labelled_sections = (
        f"[{chunk['source']}]\n{chunk['content']}" for chunk in chunks
    )
    chunk_text = "\n\n---\n\n".join(labelled_sections)
    prompt = f"""Aaron has a question he is sitting with:

{task or "What should I be thinking about that I am not?"}

You have searched his entire corpus and found material that
speaks to this question from unexpected directions. Here is
what you found:

{chunk_text}

Do not summarize. Do not list. Pick the most interesting
tension between what the corpus contains and what he is
asking, and follow it through to its conclusion. Cite
specific documents by name. Be direct about what you think.
No headers, no bullet points. 250-400 words.
End with an offer to work on it together."""
    reply = _call_claude(prompt)
    return reply


def _call_claude(prompt, max_tokens=1000):
    """Send ``prompt`` as a single user message and return the reply text.

    A new Anthropic client is created on every call using the
    ANTHROPIC_API_KEY environment variable. Only the text of the first
    content block of the response is returned.
    """
    import anthropic

    api_key = os.getenv("ANTHROPIC_API_KEY")
    client = anthropic.Anthropic(api_key=api_key)
    user_turn = {"role": "user", "content": prompt}
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=max_tokens,
        messages=[user_turn],
    )
    first_block = response.content[0]
    return first_block.text

# ─── Stage 4: Deliver ───────────────────────────────────────────────────────

def deliver(dream_text, mode, task=None):
    """Upload a dream as a Markdown file to the Nextcloud Dreams folder.

    The file is named ``<date>-<mode>.md``; if that name already exists,
    ``-1``, ``-2``, … is appended until a free name is found.

    Args:
        dream_text: Body of the dream.
        mode: Stage or mode name, used in the filename and header.
        task: Optional task text added to the header.

    Returns:
        The path ``Journal/Dreams/<filename>`` of the uploaded file.

    Raises:
        requests.HTTPError: If the existence check or the upload is rejected.
        RuntimeError: If no free filename is found within the attempt limit.
    """
    import requests
    # Sample the clock once so the filename date and header timestamp cannot
    # disagree across midnight.
    now = datetime.now()
    date_str = now.strftime("%Y-%m-%d")
    filename = f"{date_str}-{mode}.md"
    header = f"# Dream — {mode.upper()} — {now.strftime('%Y-%m-%d %H:%M')}\n"
    header += f"*prompt_sig: {prompt_signature()}*\n\n"
    if task:
        header += f"*Task: {task}*\n\n"
    header += "---\n\n"
    content = header + dream_text

    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    # Response deliberately not checked: the collection normally exists
    # already, in which case the server is expected to reject the MKCOL.
    requests.request("MKCOL", DREAMS_WEBDAV, auth=auth, timeout=10)

    url = f"{DREAMS_WEBDAV}/{filename}"
    max_attempts = 1000
    for counter in range(1, max_attempts + 1):
        check = requests.request(
            "PROPFIND", url, auth=auth, headers={"Depth": "0"}, timeout=10
        )
        if check.status_code == 404:
            break
        # Only 404 means "name is free". An auth or server error used to be
        # read as "file exists", which made this loop spin forever.
        check.raise_for_status()
        filename = f"{date_str}-{mode}-{counter}.md"
        url = f"{DREAMS_WEBDAV}/{filename}"
    else:
        raise RuntimeError(
            f"No free filename for {date_str}-{mode} after {max_attempts} attempts"
        )

    response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
    response.raise_for_status()
    print(f"Delivered: Journal/Dreams/{filename}")
    return f"Journal/Dreams/{filename}"

def notify_sse(mode, filename):
    """Best-effort POST of a "dream" event to the local notify endpoint.

    Any exception (including a missing ``requests`` package) is printed and
    swallowed, so this never interrupts the caller. The HTTP status of the
    response is not inspected.
    """
    try:
        import requests

        event = {
            "type": "dream",
            "mode": mode,
            "filename": filename,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        }
        requests.post(
            "http://localhost:8000/api/events/notify",
            json=event,
            timeout=3,
        )
    except Exception as e:
        print(f"SSE notify failed (non-critical): {e}")

# ─── State ──────────────────────────────────────────────────────────────────

def load_dreamer_state():
    """Load the dreamer state dict from DREAMER_STATE.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
    """
    try:
        state = json.loads(Path(DREAMER_STATE).read_text())
    except FileNotFoundError:
        # First run: no state yet.
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers JSON and text-decoding errors. Report it, since
        # starting from {} also resets the retrieved-sources history.
        print(f"Dreamer state unreadable, starting from empty state: {e}")
        return {}
    # Callers use .get() and item assignment, so anything but a dict is
    # unusable as state.
    return state if isinstance(state, dict) else {}

def save_dreamer_state(state):
    """Persist ``state`` as indented JSON at DREAMER_STATE.

    The JSON is written to a sibling ``.tmp`` file and then moved over the
    target, so an interrupted write cannot leave a truncated state file
    (which load_dreamer_state() would treat as an empty state).

    Args:
        state: JSON-serializable dict.
    """
    target = Path(DREAMER_STATE)
    scratch = target.with_name(target.name + ".tmp")
    scratch.write_text(json.dumps(state, indent=2))
    scratch.replace(target)

# ─── Orchestrators ───────────────────────────────────────────────────────────

def _prompt_fingerprint_text(fn):
    """Return text that changes whenever the prompt built by ``fn`` changes."""
    import inspect
    try:
        return inspect.getsource(fn)
    except (OSError, TypeError):
        # Source not available: fall back to the string constants compiled
        # into the function, which include the literal parts of its f-string.
        return "".join(c for c in fn.__code__.co_consts if isinstance(c, str))


def write_manifest(date_str, stage_data, corpus_data):
    """Upload the JSON manifest for one pipeline run to the Dreams folder.

    The manifest is stored as ``dream-manifest-<date_str>.json``, so a second
    run on the same date replaces the earlier manifest. Failures are printed
    and never raised.

    Args:
        date_str: Date used in the manifest and its filename.
        stage_data: Per-stage metrics dict.
        corpus_data: Corpus-level metrics dict.
    """
    import requests
    manifest = {
        "date": date_str,
        "prompt_sig": prompt_signature(),
        "dreamer_version": DREAMER_VERSION,
        # The synthesize_* functions have no docstrings, so hashing __doc__
        # always hashed the empty string. Fingerprint the functions themselves
        # so the hash follows prompt edits.
        "prompt_hash": prompt_hash([
            _prompt_fingerprint_text(synthesize_nrem),
            _prompt_fingerprint_text(synthesize_early_rem),
            _prompt_fingerprint_text(synthesize_late_rem),
            _prompt_fingerprint_text(synthesize_final),
        ]),
        "stages": stage_data,
        "corpus": corpus_data,
        "rating": None,
        "notes": "",
    }
    content = json.dumps(manifest, indent=2)
    auth = (NEXTCLOUD_USER, NEXTCLOUD_PASSWORD)
    url = f"{DREAMS_WEBDAV}/dream-manifest-{date_str}.json"
    try:
        response = requests.put(url, data=content.encode("utf-8"), auth=auth, timeout=30)
        # Without this, an HTTP error status was still reported as success.
        response.raise_for_status()
        print(f"Manifest written: Journal/Dreams/dream-manifest-{date_str}.json")
    except Exception as e:
        print(f"Manifest write failed (non-critical): {e}")


def _stage_summary(chunks, query, output, extra=None):
    """Build the manifest entry for one retrieval-backed stage."""
    sources = [c["source"] for c in chunks]
    folders = list({extract_folder(s) for s in sources})
    summary = {
        "chunks_retrieved": len(chunks),
        "avg_similarity": round(sum(c["relevance"] for c in chunks) / len(chunks), 3),
        "query": query,
        "word_count": len(output.split()),
        "sources": sources,
        "distinct_folders": folders,
        "folder_count": len(folders),
    }
    if extra:
        summary.update(extra)
    summary["status"] = "ok"
    return summary


def dream_pipeline():
    """
    Full nightly pipeline — interdependent stages.
    NREM output feeds Early REM. Both feed Late REM. All three feed Synthesis.

    Each stage is delivered as its own file; a manifest is written and the
    dreamer state updated at the end.

    Returns:
        The delivered synthesis file path, or None when NREM retrieval finds
        no chunks and the run is aborted.
    """
    print(f"Dreamer pipeline starting — {datetime.now().strftime('%Y-%m-%d %H:%M')}")

    state = load_dreamer_state()
    # Keep the stored order (oldest first) so the overflow trim at the end
    # can drop the oldest entries.
    prior_sources = list(state.get("retrieved_sources", []))
    previously_retrieved = set(prior_sources)
    session_retrieved = set()
    session_order = []

    delta = observe_corpus()
    print(f"Corpus: {delta['new_chunks']} new chunks, {delta['days_since_dream']:.1f} days since last dream")
    print(f"Excluding {len(previously_retrieved)} previously retrieved sources")

    # Queries recorded in the manifest. These mirror the per-mode queries
    # chosen inside retrieve().
    nrem_query = "research fabrication teaching practice recent work"
    early_query = "career decision personal change what matters next"
    recent_topics = delta.get("recent_topics") or []
    # NOTE(review): retrieve() re-reads the topics itself, so this can differ
    # if a conversation is added while the pipeline is running.
    late_query = recent_topics[0] if recent_topics else "practice place memory making"

    # ── Stage 1: NREM ──────────────────────────────────────────────────────
    print("\n[NREM] Retrieving...")
    nrem_chunks = retrieve("nrem", excluded_sources=previously_retrieved | session_retrieved)
    if not nrem_chunks:
        print("[NREM] No suitable chunks — aborting pipeline")
        return None
    nrem_sources = [c["source"] for c in nrem_chunks]
    session_retrieved.update(nrem_sources)
    session_order.extend(nrem_sources)
    # Track sources that scored above Early REM ceiling — these are the only ones Early REM should exclude
    early_rem_ceiling = MODE_RANGES["early-rem"][1]
    nrem_high_sources = {c["source"] for c in nrem_chunks if c["similarity"] > early_rem_ceiling}

    print(f"[NREM] Retrieved {len(nrem_chunks)} chunks. Synthesizing...")
    nrem_output = synthesize_nrem(nrem_chunks)
    deliver(nrem_output, "nrem")
    stage_data = {"nrem": _stage_summary(nrem_chunks, nrem_query, nrem_output)}
    print(f"[NREM] Done.\n{nrem_output[:200]}...")

    # ── Stage 2: Early REM — informed by NREM ──────────────────────────────
    print("\n[Early REM] Retrieving...")
    # Early REM excludes previously retrieved + NREM high-scorers only (not full session_retrieved)
    # Sources that scored in Early REM band during NREM remain available
    early_chunks = retrieve("early-rem", excluded_sources=previously_retrieved | nrem_high_sources)
    early_sources = [c["source"] for c in early_chunks]
    session_retrieved.update(early_sources)
    session_order.extend(early_sources)
    if not early_chunks:
        print("[Early REM] No suitable chunks — skipping")
        early_rem_output = nrem_output  # fallback
    else:
        print(f"[Early REM] Retrieved {len(early_chunks)} chunks. Synthesizing with NREM context...")
        early_rem_output = synthesize_early_rem(early_chunks, nrem_output)
        deliver(early_rem_output, "early-rem")
        stage_data["early_rem"] = _stage_summary(early_chunks, early_query, early_rem_output)
        print(f"[Early REM] Done.\n{early_rem_output[:200]}...")

    # ── Stage 3: Late REM — informed by NREM + Early REM ──────────────────
    print("\n[Late REM] Retrieving...")
    late_chunks = retrieve("late-rem", excluded_sources=previously_retrieved | session_retrieved)
    late_sources = [c["source"] for c in late_chunks]
    session_retrieved.update(late_sources)
    session_order.extend(late_sources)
    if not late_chunks:
        print("[Late REM] No suitable chunks — skipping")
        late_rem_output = early_rem_output  # fallback
    else:
        print(f"[Late REM] Retrieved {len(late_chunks)} chunks. Synthesizing with full context...")
        late_rem_output = synthesize_late_rem(late_chunks, nrem_output, early_rem_output)
        deliver(late_rem_output, "late-rem")
        late_folders = [extract_folder(s) for s in late_sources]
        # Number of unordered chunk pairs whose top-level folders differ.
        cross_domain_pairs = sum(
            1
            for i, first in enumerate(late_folders)
            for second in late_folders[i + 1:]
            if first != second
        )
        stage_data["late_rem"] = _stage_summary(
            late_chunks,
            late_query,
            late_rem_output,
            extra={"cross_domain_pairs": cross_domain_pairs},
        )
        print(f"[Late REM] Done.\n{late_rem_output[:200]}...")

    # ── Stage 4: Synthesis — all three stages ─────────────────────────────
    print("\n[Synthesis] Integrating all stages...")
    synthesis_output = synthesize_final(nrem_output, early_rem_output, late_rem_output)
    synthesis_file = deliver(synthesis_output, "synthesis")
    stage_data["synthesis"] = {
        "word_count": len(synthesis_output.split()),
        "status": "ok",
    }

    print(f"\n{'='*60}")
    print("SYNTHESIS:")
    print(synthesis_output)
    print(f"{'='*60}")

    # Write manifest
    all_session_sources = list(session_retrieved)
    all_session_folders = list({extract_folder(s) for s in all_session_sources})
    # Record the substrate retrieve() actually used: it routes to Graphiti
    # only when DREAMER_SUBSTRATE is exactly "graphiti".
    substrate = "graphiti" if os.getenv("DREAMER_SUBSTRATE", "pgvector") == "graphiti" else "pgvector"
    corpus_data = {
        # NOTE(review): "total_chunks" repeats the new-entry count; no total
        # corpus size is available here.
        "total_chunks": delta.get("new_chunks", 0),
        "new_chunks_since_last_dream": delta.get("new_chunks", 0),
        "days_since_last_dream": round(delta.get("days_since_dream", 0), 2),
        "substrate": substrate,
        "aggregate": {
            "total_distinct_sources": len(all_session_sources),
            "total_distinct_folders": len(all_session_folders),
            "folders_touched": all_session_folders,
        }
    }
    write_manifest(datetime.now().strftime("%Y-%m-%d"), stage_data, corpus_data)

    # Update state and notify
    state = load_dreamer_state()
    state["last_dream_timestamp"] = datetime.now().timestamp()
    state["last_dream_mode"] = "pipeline"
    state["last_dream_file"] = synthesis_file

    # Accumulate retrieved sources across nights. Cap at 500, trim to 400 on overflow.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # slice keeps the 400 most recent sources. Slicing a list built from a
    # set dropped arbitrary entries.
    all_retrieved = list(dict.fromkeys(prior_sources + session_order))
    if len(all_retrieved) > 500:
        all_retrieved = all_retrieved[-400:]
    state["retrieved_sources"] = all_retrieved

    save_dreamer_state(state)

    notify_sse("synthesis", synthesis_file.split("/")[-1])
    print(f"\nPipeline complete. Synthesis: {synthesis_file}")
    return synthesis_file


def dream_lucid(task):
    """Run one on-demand lucid dream for ``task`` (Dream Now in settings).

    Retrieves with the "lucid" band, synthesizes, delivers the file, records
    the run in the dreamer state and sends a notification.

    Returns:
        The delivered file path, or None when retrieval finds nothing.
    """
    task_preview = task[:80] if task else 'none'
    print(f"Lucid dream starting — task: {task_preview}")

    chunks = retrieve("lucid", task=task)
    if not chunks:
        print("No suitable chunks — aborting")
        return None

    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")
    output = synthesize_lucid(chunks, task)
    filepath = deliver(output, "lucid", task=task)

    # Record this run as the most recent dream.
    state = load_dreamer_state()
    state.update({
        "last_dream_timestamp": datetime.now().timestamp(),
        "last_dream_mode": "lucid",
        "last_dream_file": filepath,
    })
    save_dreamer_state(state)

    delivered_name = filepath.split("/")[-1]
    notify_sse("lucid", delivered_name)

    rule = "=" * 60
    print(f"\n{rule}")
    print(output)
    print(rule)
    print(f"\nDelivered to {filepath}")
    return filepath


def dream_single(mode, task=None):
    """
    Run a single stage on its own — used by Dream Now for non-lucid modes
    and for testing/tuning individual stages.

    Stages that normally receive earlier stage output are given empty
    strings instead. Any mode other than "nrem", "early-rem" or "late-rem"
    is synthesized with the lucid prompt.

    Returns the delivered file path, or None when retrieval finds nothing.
    """
    print(f"Single mode dream: {mode}")

    chunks = retrieve(mode, task=task)
    if not chunks:
        print("No suitable chunks — aborting")
        return None
    print(f"Retrieved {len(chunks)} chunks. Synthesizing...")

    # Deferred calls so that only the selected stage is actually run.
    stage_runners = {
        "nrem": lambda: synthesize_nrem(chunks),
        "early-rem": lambda: synthesize_early_rem(chunks, ""),
        "late-rem": lambda: synthesize_late_rem(chunks, "", ""),
    }
    run_stage = stage_runners.get(mode, lambda: synthesize_lucid(chunks, task))
    output = run_stage()

    filepath = deliver(output, mode, task=task)

    # Record this run as the most recent dream.
    state = load_dreamer_state()
    state.update({
        "last_dream_timestamp": datetime.now().timestamp(),
        "last_dream_mode": mode,
        "last_dream_file": filepath,
    })
    save_dreamer_state(state)

    delivered_name = filepath.split("/")[-1]
    notify_sse(mode, delivered_name)

    rule = "=" * 60
    print(f"\n{rule}")
    print(output)
    print(rule)
    print(f"\nDelivered to {filepath}")
    return filepath


# ─── CLI ────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Command-line entry point. --mode selects what to run; with no --mode
    # (or --mode pipeline) the full nightly pipeline runs.
    cli = argparse.ArgumentParser(description="Aaron AI Dreamer")
    cli.add_argument("--mode", choices=["nrem", "early-rem", "late-rem", "lucid", "pipeline"])
    cli.add_argument("--task", type=str)
    options = cli.parse_args()

    if options.mode == "lucid":
        # Lucid always gets a question; fall back to the default one.
        dream_lucid(options.task or "What should I be thinking about that I am not?")
    elif options.mode in (None, "pipeline"):
        # Default: full pipeline
        dream_pipeline()
    else:
        dream_single(options.mode, options.task)